diff --git a/.coveragerc b/.coveragerc
index 8b036f1f426..8a5a2a144ec 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,4 +1,7 @@
 [report]
+omit =
+    tools/*
+
 # Regexes for lines to exclude from consideration
 exclude_lines =
     # Have to re-enable the standard pragma
diff --git a/.github/workflows/cancel.yml b/.github/workflows/cancel.yml
new file mode 100644
index 00000000000..7027d78785d
--- /dev/null
+++ b/.github/workflows/cancel.yml
@@ -0,0 +1,13 @@
+name: Cancel
+on:
+  workflow_run:
+    workflows: ["CI", "centos7", "debian9", "doc"]
+    types:
+      - requested
+jobs:
+  cancel:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: styfle/cancel-workflow-action@0.9.1
+        with:
+          workflow_id: ${{ github.event.workflow.id }}
diff --git a/.github/workflows/centos7.yml b/.github/workflows/centos7.yml
index e1373ee14c0..94d5973e859 100644
--- a/.github/workflows/centos7.yml
+++ b/.github/workflows/centos7.yml
@@ -9,25 +9,24 @@ on:
     - master
 
 jobs:
-  check_skip:
-    runs-on: ubuntu-latest
-    if: "! contains(github.event.head_commit.message, '[ci skip]')"
-    steps:
-      - run: echo "${{ github.event.head_commit.message }}"
-
-  linter_and_test:
+  test_centos7:
     runs-on: ubuntu-latest
     container:
       image: centos:7
    env:
-      ESPNET_PYTHON_VERSION: 3.6
-      TH_VERSION: 1.8.0
+      ESPNET_PYTHON_VERSION: 3.7
+      # NOTE: 1.9.0 raised libstdc++ version errors in pyworld:
+      #   ImportError: /lib64/libstdc++.so.6: version `CXXABI_1.3.8' not found
+      #   (required by /__w/espnet/espnet/tools/venv/envs/espnet/lib/python3.6/site-packages/pyworld/pyworld.cpython-36m-x86_64-linux-gnu.so)
+      # NOTE(kamo): The issue doesn't exist for python3.7?
+      TH_VERSION: 1.10.1
       CHAINER_VERSION: 6.0.0
       USE_CONDA: true
       CC: /opt/rh/devtoolset-7/root/usr/bin/gcc
       CXX: /opt/rh/devtoolset-7/root/usr/bin/g++
+      MAKE: /opt/rh/devtoolset-7/root/usr/bin/make
       # To avoid UnicodeEncodeError for python<=3.6
       LC_ALL: en_US.UTF-8
-    needs: check_skip
     steps:
      - uses: actions/checkout@master
      - name: check OS
@@ -35,9 +34,9 @@ jobs:
       - name: install dependencies
         run: |
           # NOTE(kamo): cmake and sndfile will be downloaded using anaconda
-          yum install -y git centos-release-scl make bzip2 wget which unzip bc patch
+          yum install -y git centos-release-scl bzip2 wget which unzip bc patch
           yum-config-manager --enable rhel-server-rhscl-7-rpms
-          yum install -y devtoolset-7-gcc-c++ sox
+          yum install -y devtoolset-7-gcc-c++ devtoolset-7-make sox ncurses-devel libtool automake autoconf
           localedef -f UTF-8 -i en_US en_US
       - name: install espnet
         run: |
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 718254bdcb6..f1eb6fb47ae 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -9,42 +9,28 @@ on:
     - master
 
 jobs:
-  check_skip:
-    runs-on: ubuntu-18.04
-    if: "! contains(github.event.head_commit.message, '[ci skip]')"
-    steps:
-      - run: echo "${{ github.event.head_commit.message }}"
 
   linter_and_test:
     runs-on: ${{ matrix.os }}
-    needs: check_skip
     strategy:
       max-parallel: 20
       matrix:
-        # os: [ubuntu-16.04, ubuntu-18.04]
         os: [ubuntu-18.04]
-        python-version: [3.7, 3.8]
-        pytorch-version: [1.0.1, 1.1.0, 1.2.0, 1.3.1, 1.4.0, 1.5.1, 1.6.0, 1.7.1, 1.8.0]
+        python-version: [3.7]
+        pytorch-version: [1.4.0, 1.5.1, 1.6.0, 1.7.1, 1.8.1, 1.9.1, 1.10.1]
         chainer-version: [6.0.0]
         # NOTE(kamo): Conda is tested by Circle-CI
         use-conda: [false]
-        exclude:
-          # Exclude python=3.8 tests except for latest pytorch
-          - python-version: 3.8
-            pytorch-version: 1.0.1
-          - python-version: 3.8
-            pytorch-version: 1.1.0
-          - python-version: 3.8
-            pytorch-version: 1.2.0
-          - python-version: 3.8
-            pytorch-version: 1.3.1
-          - python-version: 3.8
-            pytorch-version: 1.4.0
-          - python-version: 3.8
-            pytorch-version: 1.5.1
-          - python-version: 3.8
-            pytorch-version: 1.6.0
-          - python-version: 3.8
-            pytorch-version: 1.7.1
+        include:
+          - os: ubuntu-20.04
+            python-version: 3.8
+            pytorch-version: 1.10.1
+            chainer-version: 6.0.0
+            use-conda: false
+          - os: ubuntu-20.04
+            python-version: 3.9
+            pytorch-version: 1.10.1
+            chainer-version: 6.0.0
+            use-conda: false
     steps:
       - uses: actions/checkout@master
       - uses: actions/cache@v1
@@ -69,16 +55,47 @@ jobs:
         CXX: g++-7
         run: |
           ./ci/install.sh
+
       - name: test shell
         run: |
           ./ci/test_shell.sh
+      - name: test python
+        run: ./ci/test_python.sh
+      - uses: codecov/codecov-action@v2
+        with:
+          flags: test_python
+      - name: coverage erase
         run: |
-          ./ci/test_python.sh
+          source tools/activate_python.sh
+          coverage erase
+
       - name: install kaldi
         run: |
           ./ci/install_kaldi.sh
-      - name: test integration
+
+      - name: test utils
+        run: ./ci/test_utils.sh
+      - uses: codecov/codecov-action@v2
+        with:
+          flags: test_utils
+      - name: coverage erase
+        run: |
+          source tools/activate_python.sh
+          coverage erase
+
+      - name: test espnet1 integration
+        run: ./ci/test_integration_espnet1.sh
+      - uses: codecov/codecov-action@v2
+        with:
+          flags: test_integration_espnet1
+      - name: coverage erase
         run: |
-          ./ci/test_integration.sh
-      - uses: codecov/codecov-action@v1
+          source tools/activate_python.sh
+          coverage erase
+
+      - name: test espnet2 integration
+        run: ./ci/test_integration_espnet2.sh
+      - uses: codecov/codecov-action@v2
+        with:
+          flags: test_integration_espnet2
diff --git a/.github/workflows/debian9.yml b/.github/workflows/debian9.yml
index e4288e8a836..a29e5474ad4 100644
--- a/.github/workflows/debian9.yml
+++ b/.github/workflows/debian9.yml
@@ -9,25 +9,19 @@ on:
     - master
 
 jobs:
-  check_skip:
-    runs-on: ubuntu-latest
-    if: "! contains(github.event.head_commit.message, '[ci skip]')"
-    steps:
-      - run: echo "${{ github.event.head_commit.message }}"
-
-  linter_and_test:
+  test_debian9:
     runs-on: ubuntu-latest
     container:
       image: debian:9
     env:
-      ESPNET_PYTHON_VERSION: 3.6
-      TH_VERSION: 1.8.0
+      ESPNET_PYTHON_VERSION: 3.7
+      TH_VERSION: 1.10.1
       CHAINER_VERSION: 6.0.0
       USE_CONDA: true
       CC: gcc-6
       CXX: g++-6
       # To avoid UnicodeEncodeError for python<=3.6
       LC_ALL: en_US.UTF-8
-    needs: check_skip
     steps:
       - uses: actions/checkout@master
       - name: check OS
@@ -36,7 +30,9 @@ jobs:
         run: |
           apt-get update -qq
           # NOTE(kamo): cmake and sndfile will be downloaded using anaconda
-          apt-get install -qq -y build-essential git g++-6 unzip bzip2 wget curl bc locales make sox
+          apt-get install -qq -y \
+              build-essential git g++-6 unzip bzip2 wget curl bc locales make sox \
+              libncurses5-dev automake libtool pkg-config
           localedef -f UTF-8 -i en_US en_US
       - name: install espnet
         run: ./ci/install.sh
@@ -46,6 +42,9 @@ jobs:
         run: ./ci/test_python.sh
       - name: install kaldi
         run: ./ci/install_kaldi.sh
-      - name: test integration
-        run: ./ci/test_integration.sh
-      - uses: codecov/codecov-action@v1
+      - name: test utils
+        run: ./ci/test_utils.sh
+      - name: test espnet1 integration
+        run: ./ci/test_integration_espnet1.sh
+      - name: test espnet2 integration
+        run: ./ci/test_integration_espnet2.sh
diff --git a/.github/workflows/doc.yml b/.github/workflows/doc.yml
new file mode 100644
index 00000000000..eede0bc8044
--- /dev/null
+++ b/.github/workflows/doc.yml
@@ -0,0 +1,46 @@
+name: doc
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  linter_and_test:
+    runs-on: ubuntu-18.04
+    steps:
+      - uses: actions/checkout@master
+      - uses: actions/cache@v1
+        with:
+          path: ~/.cache/pip
+          key: pip-${{ hashFiles('**/setup.py') }}
+      - uses: actions/setup-python@v1
+        with:
+          python-version: 3.8
+          architecture: 'x64'
+      - name: check OS
+        run: cat /etc/os-release
+      - name: install dependencies
+        run: |
+          sudo apt-get update -qq
+          sudo apt-get install -qq -y cmake python3-dev git g++-7 pandoc ffmpeg bc
+      - name: install espnet
+        env:
+          ESPNET_PYTHON_VERSION: 3.8
+          TH_VERSION: 1.10.1
+          CHAINER_VERSION: 6.0.0
+          USE_CONDA: false
+          CC: gcc-7
+          CXX: g++-7
+        run: ./ci/install.sh
+      - name: generate doc
+        run: ./ci/doc.sh
+      - name: deploy
+        if: github.ref == 'refs/heads/master'
+        uses: peaceiris/actions-gh-pages@v3
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          publish_dir: doc/build
diff --git a/.github/workflows/test_import.yaml b/.github/workflows/test_import.yaml
new file mode 100644
index 00000000000..ead9f587c07
--- /dev/null
+++ b/.github/workflows/test_import.yaml
@@ -0,0 +1,51 @@
+name: Test import espnet
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  test_import:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      max-parallel: 20
+      matrix:
+        os: [ubuntu-latest]
+        python-version: [3.9]
+        pytorch-version: [1.10.1]
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/cache@v1
+        with:
+          path: ~/.cache/pip
+          key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-${{ hashFiles('**/setup.py') }}
+      - name: Set up Python
+        uses: actions/setup-python@v1
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          sudo apt-get install -qq -y libsndfile1-dev
+          python3 -m pip install --upgrade pip
+      - name: Install espnet with minimal requirements
+        env:
+          TH_VERSION: ${{ matrix.pytorch-version }}
+        run: |
+          ./tools/installers/install_torch.sh false ${TH_VERSION} CPU
+          ./tools/installers/install_chainer.sh CPU
+          python3 -m pip install -e .
+      - name: Import all modules (Try1)
+        run: |
+          python3 ./ci/test_import_all.py
+      - name: Install espnet with the full requirements
+        env:
+          TH_VERSION: ${{ matrix.pytorch-version }}
+        run: |
+          python3 -m pip install -e ".[all]"
+      - name: Import all modules (Try2)
+        run: |
+          python3 ./ci/test_import_all.py
diff --git a/.gitignore b/.gitignore
index deff1897ea8..7170a376705 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,6 +26,7 @@ test_spm.model
 .vscode*
 *.vim
 *.swp
+*.nfs*
 
 # recipe related
 egs*/*/*/data*
@@ -39,8 +40,13 @@ egs*/*/*/mfcc
 egs*/*/*/stft
 egs*/*/*/tensorboard
 egs*/*/*/wav*
+egs*/*/*/nltk*
+egs*/*/*/.cache*
+egs*/*/*/pretrained_models*
+egs*/fisher_callhome_spanish/*/local/mapping*
 
 # tools related
+tools/chainer
 tools/bin
 tools/include
 tools/lib
@@ -48,6 +54,7 @@ tools/lib64
 tools/bats-core
 tools/chainer_ctc/
 tools/kaldi*
+tools/activate_python.sh
 tools/miniconda.sh
 tools/moses/
 tools/mwerSegmenter/
@@ -62,6 +69,8 @@ tools/PESQ*
 tools/hts_engine_API*
 tools/open_jtalk*
 tools/pyopenjtalk*
+tools/tdmelodic_openjtalk*
+tools/s3prl
 tools/sctk*
 tools/sph2pipe*
 tools/espeak-ng*
@@ -69,3 +78,5 @@ tools/MBROLA*
 tools/festival*
 tools/speech_tools*
 tools/phonemizer*
+tools/py3mmseg
+tools/._*
diff --git a/.gitmodules b/.gitmodules
index bc771d8c6ee..e69de29bb2d 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +0,0 @@
-[submodule "doc/notebook"]
-    path = doc/notebook
-    url = https://github.com/espnet/notebook
diff --git a/.mergify.yml b/.mergify.yml
index 8e6872169ac..0304250182c 100644
--- a/.mergify.yml
+++ b/.mergify.yml
@@ -2,19 +2,22 @@ pull_request_rules:
   - name: automatic merge if label=auto-merge
     conditions:
       - "label=auto-merge"
-      - "status-success=linter_and_test (ubuntu-18.04, 3.7, 1.0.1, 6.0.0, false)"
-      - "status-success=linter_and_test (ubuntu-18.04, 3.7, 1.1.0, 6.0.0, false)"
-      - "status-success=linter_and_test (ubuntu-18.04, 3.7, 1.2.0, 6.0.0, false)"
-      - "status-success=linter_and_test (ubuntu-18.04, 3.7, 1.3.1, 6.0.0, false)"
-      - "status-success=linter_and_test (ubuntu-18.04, 3.7, 1.4.0, 6.0.0, false)"
-      - "status-success=linter_and_test (ubuntu-18.04, 3.7, 1.5.1, 6.0.0, false)"
-      - "status-success=linter_and_test (ubuntu-18.04, 3.7, 1.6.0, 6.0.0, false)"
-      - "status-success=linter_and_test (ubuntu-18.04, 3.7, 1.7.1, 6.0.0, false)"
-      - "status-success=linter_and_test (ubuntu-18.04, 3.8, 1.8.0, 6.0.0, false)"
+      - "check-success=test_centos7"
+      - "check-success=test_debian9"
+      - "check-success=linter_and_test (ubuntu-18.04, 3.7, 1.3.1, 6.0.0, false)"
+      - "check-success=linter_and_test (ubuntu-18.04, 3.7, 1.4.0, 6.0.0, false)"
+      - "check-success=linter_and_test (ubuntu-18.04, 3.7, 1.5.1, 6.0.0, false)"
+      - "check-success=linter_and_test (ubuntu-18.04, 3.7, 1.6.0, 6.0.0, false)"
+      - "check-success=linter_and_test (ubuntu-18.04, 3.7, 1.7.1, 6.0.0, false)"
+      - "check-success=linter_and_test (ubuntu-18.04, 3.7, 1.8.1, 6.0.0, false)"
+      - "check-success=linter_and_test (ubuntu-18.04, 3.7, 1.9.1, 6.0.0, false)"
+      - "check-success=linter_and_test (ubuntu-18.04, 3.7, 1.10.1, 6.0.0, false)"
+      - "check-success=linter_and_test (ubuntu-20.04, 3.8, 1.10.1, 6.0.0, false)"
+      - "check-success=linter_and_test (ubuntu-20.04, 3.9, 1.10.1, 6.0.0, false)"
+      - "check-success=test_import (ubuntu-latest, 3.9, 1.10.1)"
     actions:
       merge:
         method: merge
-        strict: false
   - name: delete head branch after merged
     conditions:
       - merged
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index be2f40c96df..00000000000
--- a/.travis.yml
+++ /dev/null
@@ -1,47 +0,0 @@
-dist: xenial
-language: python
-python:
-    - "3.8"
-
-cache:
-    - pip
-    - ccache
-
-addons:
-    apt:
-        sources:
-            - ubuntu-toolchain-r-test
-        packages:
-            - cmake
-            - python3-dev
-            - g++-7
-            - pandoc
-            - ffmpeg
-            - bc
-
-env:
-    - USE_CONDA=false ESPNET_PYTHON_VERSION=3.8 TH_VERSION=1.8.0 CHAINER_VERSION=6.0.0 CC=gcc-7 CXX=g++-7
-    # torch nightly
-    # - USE_CONDA=false ESPNET_PYTHON_VERSION=3.7.3 TH_VERSION=nightly CHAINER_VERSION=6.0.0 CC=gcc-7 CXX=g++-7
-
-matrix:
-    allow_failures:
-        # torch nightly
-        # - env: USE_CONDA=false ESPNET_PYTHON_VERSION=3.7.3 TH_VERSION=nightly CHAINER_VERSION=6.0.0 CC=gcc-7 CXX=g++-7
-
-install:
-    - travis_retry ./ci/install.sh
-    # - travis_retry ./ci/install_kaldi.sh
-
-script:
-    # NOTE(kamo): unittests and build documentation only
-    # - ./ci/test_shell.sh
-    - ./ci/test_python.sh
-    # - ./ci/test_integration.sh
-    - ./ci/doc.sh
-
-sudo: false
-
-after_success:
-    # - bash <(curl -s https://codecov.io/bash)
-    - travis-sphinx deploy -m "Update documentation [ci skip]"
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 31eca6141e1..979e7397012 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,19 +1,19 @@
 # How to contribute to ESPnet
 
 ## 1. What to contribute
-If you are interested in contributing to ESPnet, your contributions will fall into three categories: major features, minor updates, and recipes. 
+If you are interested in contributing to ESPnet, your contributions will fall into three categories: major features, minor updates, and recipes.
 
 ### 1.1 Major features
-If you want to ask or propose a new feature, please first open a new issue with the tag `Feature request` 
-or directly contact Shinji Watanabe or other main developers. Each feature implementation 
+If you want to ask or propose a new feature, please first open a new issue with the tag `Feature request`
+or directly contact Shinji Watanabe or other main developers. Each feature implementation
 and design should be discussed and modified according to ongoing and future works.
-You can find ongoing major development plans at https://github.com/espnet/espnet/milestones 
+You can find ongoing major development plans at https://github.com/espnet/espnet/milestones
 or in https://github.com/espnet/espnet/issues (pinned issues)
 
 ### 1.2 Minor Updates (minor feature, bug-fix for an issue)
-If you want to propose a minor feature, update an existing minor feature, or fix a bug, please first take a look at 
+If you want to propose a minor feature, update an existing minor feature, or fix a bug, please first take a look at
 the existing [issues](https://github.com/espnet/espnet/issues) and/or [pull requests](https://github.com/espnet/espnet/pulls).
 Pick an issue and comment on it that you want to work on that feature.
 
@@ -21,26 +21,26 @@ If you need help or additional information to propose the feature, you can open
 
 ### 1.3 Recipes
 
-ESPnet provides and maintains many example scripts, called `recipes`, that demonstrate how to 
+ESPnet provides and maintains many example scripts, called `recipes`, that demonstrate how to
 use the toolkit. The recipes for ESPnet1 are put under `egs` directory, while ESPnet2 ones are put under `egs2`.
 Similar to Kaldi, each subdirectory of `egs` and `egs2` corresponds to a corpus that we have example scripts for.
 
 #### 1.3.1 ESPnet1 recipes
 
-ESPnet1 recipes (`egs/X`) follow the convention from [Kaldi](https://github.com/kaldi-asr/kaldi) and may rely on 
-several utilities available in Kaldi. 
As such, porting a new recipe from Kaldi to ESPnet is natural, and the user
+ESPnet1 recipes (`egs/X`) follow the convention from [Kaldi](https://github.com/kaldi-asr/kaldi) and may rely on
+several utilities available in Kaldi. As such, porting a new recipe from Kaldi to ESPnet is natural, and the user
 may refer to [port-kaldi-recipe](https://github.com/espnet/espnet/wiki/How-to-port-the-Kaldi-recipe-to-the-ESPnet-recipe%3F)
-and other existing recipes for new additions. For the Kaldi-style recipe architecture, please refer to 
+and other existing recipes for new additions. For the Kaldi-style recipe architecture, please refer to
 [Prepare-Kaldi-Style-Directory](https://kaldi-asr.org/doc/data_prep.html).
- 
+
-For each recipe, we ask you to report the following: experiments results and environnement, model information. 
-For reproducibility, a link to upload the pre-trained model may also be added. All this information should be written 
-in a markdown file called `RESULTS.md` and put at the recipe root. You can refer to 
+For each recipe, we ask you to report the following: experiment results, environment, and model information.
+For reproducibility, a link to upload the pre-trained model may also be added. All this information should be written
+in a markdown file called `RESULTS.md` and put at the recipe root. You can refer to
 [tedlium2-example](https://github.com/espnet/espnet/blob/master/egs/tedlium2/asr1/RESULTS.md) for an example.
- 
+
 To generate `RESULTS.md` for a recipe, please follow these instructions:
-- Execute `~/espnet/utils/show_result.sh` at the recipe root (where `run.sh` is located). 
-You'll get your environment information and evaluation results for each experiment in a markdown format. 
+- Execute `~/espnet/utils/show_result.sh` at the recipe root (where `run.sh` is located).
+You'll get your environment information and evaluation results for each experiment in a markdown format.
 From here, you can copy or redirect text output to `RESULTS.md`.
 - Execute `~/espnet/utils/pack_model.sh` at the recipe root to generate a packed ESPnet model called `model.tar.gz`
 and output model information. Executing the utility script without argument will give you the expected arguments.
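+
+As a rough sketch, the two utilities above can be chained as follows (tedlium2 is only an example recipe; adjust the paths to your checkout):
+
+```sh
+cd egs/tedlium2/asr1                          # recipe root, where run.sh is located
+~/espnet/utils/show_result.sh > RESULTS.md    # environment info and results in markdown
+~/espnet/utils/pack_model.sh                  # without arguments, prints the expected usage
+```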
@@ -50,30 +50,62 @@ and output model information. Executing the utility script without argument will
 
 #### 1.3.2 ESPnet2 recipes
 
 ESPnet2's recipes correspond to `egs2`. ESPnet2 applies a new paradigm without dependencies of Kaldi's binaries, which makes it lighter and more generalized.
-For ESPnet2, we do not recommend preparing the recipe's stages for each corpus but using the common pipelines we provided in `asr.sh`, `tts.sh`, and 
+For ESPnet2, we do not recommend preparing the recipe's stages for each corpus but using the common pipelines we provided in `asr.sh`, `tts.sh`, and
 `enh.sh`. For details of creating ESPnet2 recipes, please refer to [egs2-readme](https://github.com/espnet/espnet/blob/master/egs2/TEMPLATE/README.md).
 
-The common pipeline of ESPnet2 recipes will take care of the `RESULTS.md` generation, model packing, and uploading. ESPnet2 models are maintained at Zenodo.
+The common pipeline of ESPnet2 recipes will take care of the `RESULTS.md` generation, model packing, and uploading. ESPnet2 models are maintained at Hugging Face and Zenodo (deprecated).
 You can also refer to the document in https://github.com/espnet/espnet_model_zoo
-To upload your model, you need first:
+To upload your model to Zenodo (deprecated; uploading to the Hugging Face Hub is preferred), you first need to:
 1. Sign up to Zenodo: https://zenodo.org/
 2. Create access token: https://zenodo.org/account/settings/applications/tokens/new/
 3. Set your environment: % export ACCESS_TOKEN=""
 
+To port models from Zenodo to the Hugging Face Hub:
+1. Create a Hugging Face account - https://huggingface.co/
+2. Request to be added to the espnet organization - https://huggingface.co/espnet
+3. Go to `egs2/RECIPE/*/scripts/utils` and run `./upload_models_to_hub.sh "ZENODO_MODEL_NAME"`
+
+To upload models using the Hugging Face CLI, follow these steps
+(you can also refer to https://huggingface.co/docs/transformers/model_sharing):
+1. Create a Hugging Face account - https://huggingface.co/
+2. Request to be added to the espnet organization - https://huggingface.co/espnet
+3. Run `huggingface-cli login` (you can create the access token needed at this step under Settings > Access Tokens)
+4. `huggingface-cli repo create your-model-name --organization espnet`
+5. `git clone https://huggingface.co/username/your-model-name` (clone this outside the ESPnet repository to avoid issues, as this is itself a git repo)
+6. `cd your-model-name`
+7. `git lfs install`
+8. Copy the contents of the `exp` directory of your recipe into this directory (check other models of a similar task under ESPnet to confirm your directory structure)
+9. `git add .`
+10. `git commit -m "Add model files"`
+11. `git push`
+12. Check if the inference demo on HF is running successfully to verify the upload
+
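+As a rough sketch, steps 4-11 above amount to the following (assuming `git-lfs` and `huggingface-cli` are installed and you are a member of the espnet organization; `your-model-name` and the `exp` path are placeholders):
+
+```sh
+huggingface-cli repo create your-model-name --organization espnet
+git clone https://huggingface.co/espnet/your-model-name   # clone outside the ESPnet repository
+cd your-model-name
+git lfs install
+cp -r /path/to/egs2/your_recipe/asr1/exp .                # copy the exp contents of your recipe
+git add .
+git commit -m "Add model files"
+git push
+```
+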
 
 #### 1.3.3 Additional requirements for new recipe
 
 - Common/shared files and directories such as `utils`, `steps`, `asr.sh`, etc, should be linked using
-a symbolic link (e.g.: `ln -s `). Please refer to existing recipes if you're 
+a symbolic link (e.g.: `ln -s `). Please refer to existing recipes if you're
 unaware which files/directories are shared. Note that in espnet2, some of them are automatically generated by https://github.com/espnet/espnet/blob/master/egs2/TEMPLATE/asr1/setup.sh.
-- Default training and decoding configurations (i.e.: the default one in `run.sh`) should be named respectively `train.yaml` 
+- Default training and decoding configurations (i.e.: the default one in `run.sh`) should be named respectively `train.yaml`
 and `decode.yaml` and put in `conf/`. Additional or variant configurations should be put in `conf/tuning/` and named according
-to its differences. 
+to their differences.
 - If a recipe for a new corpus is proposed, you should add its name and information to:
-https://github.com/espnet/espnet/blob/master/egs/README.md if it's a ESPnet1 recipe, 
+https://github.com/espnet/espnet/blob/master/egs/README.md if it's an ESPnet1 recipe,
 or https://github.com/espnet/espnet/blob/master/egs2/README.md + `db.sh` if it's an ESPnet2 recipe.
+
+#### 1.3.4 Checklist before you submit the recipe-based PR
+
+- [ ] be careful about the name of the recipe. It is recommended to follow the naming conventions of the other recipes
+- [ ] common/shared files are linked with a **soft link** (see Section 1.3.3)
+- [ ] modified or new python scripts should be passed through the **latest** black formatting (using the python package black). The command to be executed could be `black espnet espnet2 test utils setup.py egs*/*/*/local egs2/TEMPLATE/asr1/pyscripts`
+- [ ] cluster settings should be set as **default** (e.g., cmd.sh, conf/slurm.conf, conf/queue.conf, conf/pbs.conf)
+- [ ] update `egs/README.md` or `egs2/README.md` with the corresponding recipe
+- [ ] add a corresponding entry in `egs2/TEMPLATE/db.sh` for a new corpus
+- [ ] try to **simplify** the model configurations. We recommend having only the best configuration for the start of a recipe. Please also follow the default rule defined in Section 1.3.3
+- [ ] large meta-information for a corpus should be maintained elsewhere other than in the recipe itself
+- [ ] we recommend also including results and a pre-trained model with the recipe
 
 ## 2 Pull Request
 
-If your proposed feature or bugfix is ready, please open a Pull Request (PR) at https://github.com/espnet/espnet 
+If your proposed feature or bugfix is ready, please open a Pull Request (PR) at https://github.com/espnet/espnet
 or use the Pull Request button in your forked repo.
 If you're not familiar with the process, please refer to the following guides:
 - http://stackoverflow.com/questions/14680711/how-to-do-a-github-pull-request
@@ -85,7 +117,7 @@ We basically develop in the `master` branch.
 
 1. We will keep the first version digit `0` until we have some super major changes in the project organization level.
 
-2. The second version digit will be updated when we have major updates, including new functions and refactoring, and 
+2. The second version digit will be updated when we have major updates, including new functions and refactoring, and
 their related bug fix and recipe changes. This version update will be done roughly every half year so far
 (but it depends on the development plan).
 
@@ -114,11 +146,11 @@ have the format `def test_yyy(...)`. [Pytest](https://docs.pytest.org/en/latest
 Technically, a test file should only cover methods from one file (e.g.: `test_transformer_utils.py` to test `transformer_utils.py`).
 - To monitor test coverage and avoid the overlapping test, we recommend using `pytest --cov-report term-missing ` to highlight covered and missed lines. For more details, please refer to [coverage-test](https://pytest-cov.readthedocs.io/en/latest/readme.html).
-- We limited test running time to 2.0 seconds (see: [pytest-timeouts](https://pypi.org/project/pytest-timeouts/)). As such, 
+- We limited test running time to 2.0 seconds (see: [pytest-timeouts](https://pypi.org/project/pytest-timeouts/)). As such,
 we recommend using small model parameters and avoiding dynamic imports, file access, and unnecessary loops. If a unit test needs more running time, you can annotate your test with `@pytest.mark.execution_timeout(sec)`.
 - For test initialization (parameters, modules, etc), you can use pytest fixtures. Refer to [pytest fixtures](https://docs.pytest.org/en/latest/fixture.html#using-fixtures-from-classes-modules-or-projects) for more information.
- 
+
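+
+As a rough sketch, a local unit-test run following the recommendations above looks like this (assuming the `tools/` venv is set up; the test file is only an example):
+
+```sh
+source tools/activate_python.sh
+# run one test file and highlight covered and missed lines
+pytest --cov-report term-missing test/test_transformer_utils.py
+```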
 
 ### 4.2 Bash scripts
 
@@ -127,15 +159,15 @@ You can also test the scripts in `utils` with [bats-core](https://github.com/bat
 To test:
 ``` console
-./ci/test_bash.sh
+./ci/test_shell.sh
 ```
 
 ## 5 Integration testing
 
-Write new integration tests in [ci/test_integration.sh](ci/test_integration.sh) when you add new features in [espnet/bin](espnet/bin). They use our smallest dataset [egs/mini_an4](egs/mini_an4) to test `run.sh`. To make the coverage take them into account, don't forget `--python ${python}` support in your `run.sh`
+Write new integration tests in [ci/test_integration_espnet1.sh](ci/test_integration_espnet1.sh) or [ci/test_integration_espnet2.sh](ci/test_integration_espnet2.sh) when you add new features in [espnet/bin](espnet/bin) or [espnet2/bin](espnet2/bin), respectively. They use our smallest dataset [egs/mini_an4](egs/mini_an4) or [egs2/mini_an4](egs2/mini_an4) to test `run.sh`. **Don't call `python` directly in integration tests. Instead, use `coverage run --append`** as the python interpreter. In particular, `run.sh` should support `--python ${python}` to call the custom interpreter.
 
 ```bash
-# ci/integration_test.sh
+# ci/test_integration_espnet{1,2}.sh
 
 python="coverage run --append"
 
@@ -150,6 +182,7 @@ cd egs/mini_an4/your_task
 
 - [.travis.yml](.travis.yml) configures Travis-CI (unittests, doc deploy).
 - [.circleci/config.yml](.circleci/config.yml) configures Circle-CI (unittests, integration tests).
 - [.github/workflows](.github/workflows/) configures Github Actions (unittests, integration tests).
+- [codecov.yml](codecov.yml) configures CodeCov (code coverage).
 
 ## 6 Writing new tools
diff --git a/README.md b/README.md
index 05d56d86f66..082e5450f78 100644
--- a/README.md
+++ b/README.md
@@ -2,13 +2,14 @@
 
 # ESPnet: end-to-end speech processing toolkit
 
-|system/pytorch ver.|1.0.1|1.1.0|1.2.0|1.3.1|1.4.0|1.5.1|1.6.0|1.7.1|1.8.0|
-| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
-|ubuntu18/python3.8/pip|||||||||[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)|
-|ubuntu18/python3.7/pip|[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)|[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)|[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)|[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)|[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)|[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)|[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)|[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)|[![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions)|
-|debian9/python3.6/conda|||||||||[![debian9](https://github.com/espnet/espnet/workflows/debian9/badge.svg)](https://github.com/espnet/espnet/actions?query=workflow%3Adebian9)|
-|centos7/python3.6/conda|||||||||[![centos7](https://github.com/espnet/espnet/workflows/centos7/badge.svg)](https://github.com/espnet/espnet/actions?query=workflow%3Acentos7)|
-|[docs/coverage] python3.8|||||||||[![Build Status](https://travis-ci.org/espnet/espnet.svg?branch=master)](https://travis-ci.org/espnet/espnet)|
+| system/pytorch ver. 
| 1.4.0 | 1.5.1 | 1.6.0 | 1.7.1 | 1.8.1 | 1.9.1 | 1.10.1 | +| :---------------------: | :--------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------: | +| ubuntu20/python3.9/pip | | | | | | | [![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions) | +| ubuntu20/python3.8/pip | | | | | | | [![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions) | +| ubuntu18/python3.7/pip | [![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions) | [![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions) | [![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions) | [![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions) | [![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions) | [![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions) | [![Github Actions](https://github.com/espnet/espnet/workflows/CI/badge.svg)](https://github.com/espnet/espnet/actions) | +| debian9/python3.7/conda | | | | | | | [![debian9](https://github.com/espnet/espnet/workflows/debian9/badge.svg)](https://github.com/espnet/espnet/actions?query=workflow%3Adebian9) | +| centos7/python3.7/conda | | | | | | | [![centos7](https://github.com/espnet/espnet/workflows/centos7/badge.svg)](https://github.com/espnet/espnet/actions?query=workflow%3Acentos7) | +| doc/python3.8 | | | | | | | [![doc](https://github.com/espnet/espnet/workflows/doc/badge.svg)](https://github.com/espnet/espnet/actions?query=workflow%3Adoc) | [![PyPI version](https://badge.fury.io/py/espnet.svg)](https://badge.fury.io/py/espnet) [![Python Versions](https://img.shields.io/pypi/pyversions/espnet.svg)](https://pypi.org/project/espnet/) @@ -26,53 +27,97 @@ | [**Notebook**](https://github.com/espnet/notebook) | [**Tutorial (2019)**](https://github.com/espnet/interspeech2019-tutorial) -ESPnet is an end-to-end speech processing toolkit, mainly focuses on end-to-end speech recognition and end-to-end text-to-speech. -ESPnet uses [chainer](https://chainer.org/) and [pytorch](http://pytorch.org/) as a main deep learning engine, -and also follows [Kaldi](http://kaldi-asr.org/) style data processing, feature extraction/format, and recipes to provide a complete setup for speech recognition and other speech processing experiments. 
- +ESPnet is an end-to-end speech processing toolkit covering end-to-end speech recognition, text-to-speech, speech translation, speech enhancement, speaker diarization, spoken language understanding, and so on. +ESPnet uses [pytorch](http://pytorch.org/) as a deep learning engine and also follows [Kaldi](http://kaldi-asr.org/) style data processing, feature extraction/format, and recipes to provide a complete setup for various speech processing experiments. ## Key Features ### Kaldi style complete recipe - Support numbers of `ASR` recipes (WSJ, Switchboard, CHiME-4/5, Librispeech, TED, CSJ, AMI, HKUST, Voxforge, REVERB, etc.) - Support numbers of `TTS` recipes with a similar manner to the ASR recipe (LJSpeech, LibriTTS, M-AILABS, etc.) - Support numbers of `ST` recipes (Fisher-CallHome Spanish, Libri-trans, IWSLT'18, How2, Must-C, Mboshi-French, etc.) -- Support numbers of `MT` recipes (IWSLT'16, the above ST recipes etc.) -- Support speech separation and recognition recipe (WSJ-2mix) -- Support voice conversion recipe (VCC2020 baseline) (new!) - +- Support numbers of `MT` recipes (IWSLT'14, IWSLT'16, the above ST recipes etc.) +- Support numbers of `SLU` recipes (CATSLU-MAPS, FSC, Grabo, IEMOCAP, JDCINAL, SNIPS, SLURP, SWBD-DA, etc.) +- Support numbers of `SE/SS` recipes (DNS-IS2020, LibriMix, SMS-WSJ, VCTK-noisyreverb, WHAM!, WHAMR!, WSJ-2mix, etc.) +- Support voice conversion recipe (VCC2020 baseline) +- Support speaker diarization recipe (mini_librispeech) ### ASR: Automatic Speech Recognition - **State-of-the-art performance** in several ASR benchmarks (comparable/superior to hybrid DNN/HMM and CTC) - **Hybrid CTC/attention** based end-to-end ASR - Fast/accurate training with CTC/attention multitask training - CTC/attention joint decoding to boost monotonic alignment decoding - - Encoder: VGG-like CNN + BiRNN (LSTM/GRU), sub-sampling BiRNN (LSTM/GRU) or Transformer + - Encoder: VGG-like CNN + BiRNN (LSTM/GRU), sub-sampling BiRNN (LSTM/GRU), Transformer, or conformer - Attention: Dot product, location-aware attention, variants of multihead - Incorporate RNNLM/LSTMLM/TransformerLM/N-gram trained only with text data - Batch GPU decoding +- Data augmentation - **Transducer** based end-to-end ASR - - Available: RNN-based encoder/decoder or custom encoder/decoder w/ supports for Transformer, Conformer, TDNN (encoder) and causal conv1d (decoder) blocks. - - Also support: mixed RNN/Custom encoder-decoder, VGG2L (RNN/Cutom encoder) and various decoding algorithms. + - Architecture: + - RNN-based encoder and decoder. + - Custom encoder and decoder supporting Transformer, Conformer (encoder), 1D Conv / TDNN (encoder) and causal 1D Conv (decoder) blocks. + - VGG2L (RNN/custom encoder) and Conv2D (custom encoder) bottlenecks. + - Search algorithms: + - Greedy search constrained to one emission by timestep. + - Default beam search algorithm [[Graves, 2012]](https://arxiv.org/abs/1211.3711) without prefix search. + - Alignment-Length Synchronous decoding [[Saon et al., 2020]](https://ieeexplore.ieee.org/abstract/document/9053040). + - Time Synchronous Decoding [[Saon et al., 2020]](https://ieeexplore.ieee.org/abstract/document/9053040). + - N-step Constrained beam search modified from [[Kim et al., 2020]](https://arxiv.org/abs/2002.03577). + - modified Adaptive Expansion Search based on [[Kim et al., 2021]](https://ieeexplore.ieee.org/abstract/document/9250505) and NSC. 
+  - Features:
+    - Multi-task learning with various auxiliary losses:
+      - Encoder: CTC, auxiliary Transducer and symmetric KL divergence.
+      - Decoder: cross-entropy w/ label smoothing.
+    - Transfer learning with acoustic model and/or language model.
+    - Training with FastEmit regularization method [[Yu et al., 2021]](https://arxiv.org/abs/2010.11148).
 
 > Please refer to the [tutorial page](https://espnet.github.io/espnet/tutorial.html#transducer) for complete documentation.
 
 - CTC segmentation
-- Non-autoregressive based on Mask CTC
+- Non-autoregressive model based on Mask-CTC
 - ASR examples for supporting endangered language documentation (Please refer to egs/puebla_nahuatl and egs/yoloxochitl_mixtec for details)
 - Wav2Vec2.0 pretrained model as Encoder, imported from [FairSeq](https://github.com/pytorch/fairseq/tree/master/fairseq).
+- Self-supervised learning representations as features, using upstream models in [S3PRL](https://github.com/s3prl/s3prl) as the frontend.
+  - Set `frontend` to be `s3prl`
+  - Select any upstream model by setting the `frontend_conf` to the corresponding name.
+- Streaming Transformer/Conformer ASR with blockwise synchronous beam search.
+- Restricted Self-Attention based on [Longformer](https://arxiv.org/abs/2004.05150) as an encoder for long sequences
+
+### SUM: Speech Summarization
+- End-to-End Speech Summarization Recipe for Instructional Videos using Restricted Self-Attention [[Sharma et al., 2022]](https://arxiv.org/abs/2110.06263)
+
+Demonstration
+- Real-time ASR demo with ESPnet2 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/espnet/notebook/blob/master/espnet2_asr_realtime_demo.ipynb)
+- [Gradio](https://github.com/gradio-app/gradio) Web Demo on [Huggingface Spaces](https://huggingface.co/docs/hub/spaces). Check out the [Web Demo](https://huggingface.co/spaces/akhaliq/espnet2_asr)
+- Streaming Transformer ASR [Local Demo](https://github.com/espnet/notebook/blob/master/espnet2_streaming_asr_demo.ipynb) with ESPnet2.
 
 ### TTS: Text-to-speech
-- Tacotron2
-- Transformer-TTS
-- FastSpeech
-- FastSpeech2 (in ESPnet2)
-- Conformer-based FastSpeech & FastSpeech2 (in ESPnet2)
-- Multi-speaker model with pretrained speaker embedding
-- Multi-speaker model with GST (in ESPnet2)
-- Phoneme-based training (En, Jp, and Zn)
-- Integration with neural vocoders (WaveNet, ParallelWaveGAN, and MelGAN)
-
-You can try demo online now!
+- Architecture
+  - Tacotron2
+  - Transformer-TTS
+  - FastSpeech
+  - FastSpeech2
+  - Conformer FastSpeech & FastSpeech2
+  - VITS
+- Multi-speaker & multi-language extension
+  - Pretrained speaker embedding (e.g., X-vector)
+  - Speaker ID embedding
+  - Language ID embedding
+  - Global style token (GST) embedding
+  - Mix of the above embeddings
+- End-to-end training
+  - End-to-end text-to-wav model (e.g., VITS)
+  - Joint training of text2mel and vocoder
+- Various language support
+  - En / Jp / Zh / De / Ru / And more...
+- Integration with neural vocoders
+  - Parallel WaveGAN
+  - MelGAN
+  - Multi-band MelGAN
+  - HiFiGAN
+  - StyleMelGAN
+  - Mix of the above models
+
+Demonstration
 - Real-time TTS demo with ESPnet2 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/espnet/notebook/blob/master/espnet2_tts_realtime_demo.ipynb)
-- Real-time TTS demo with ESPnet1 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/espnet/notebook/blob/master/tts_realtime_demo.ipynb)
+- Integrated into [Huggingface Spaces](https://huggingface.co/spaces) with [Gradio](https://github.com/gradio-app/gradio). See demo: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/akhaliq/ESPnet2-TTS)
 
 To train the neural vocoder, please check the following repositories:
 - [kan-bayashi/ParallelWaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN)
@@ -82,6 +127,21 @@ To train the neural vocoder, please check the following repositories:
 > - We are moving to ESPnet2-based development for TTS.
 > - If you are a beginner, we recommend using [ESPnet2-TTS](https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/tts1).
 
+### SE: Speech enhancement (and separation)
+
+- Single-speaker speech enhancement
+- Multi-speaker speech separation
+- Unified encoder-separator-decoder structure for time-domain and frequency-domain models
+  - Encoder/Decoder: STFT/iSTFT, Convolution/Transposed-Convolution
+  - Separators: BLSTM, Transformer, Conformer, [TasNet](https://arxiv.org/abs/1809.07454), [DPRNN](https://arxiv.org/abs/1910.06379), [DC-CRN](https://web.cse.ohio-state.edu/~wang.77/papers/TZW.taslp21.pdf), [DCCRN](https://arxiv.org/abs/2008.00264), Neural Beamformers, etc.
+- Flexible ASR integration: working as an individual task or as the ASR frontend
+- Easy to import pretrained models from [Asteroid](https://github.com/asteroid-team/asteroid)
+  - Both the pre-trained models from Asteroid and the specific configuration are supported.
+
+Demonstration
+- Interactive SE demo with ESPnet2 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1fjRJCh96SoYLZPRxsjF9VDv4Q2VoIckI?usp=sharing)
+
+
 ### ST: Speech Translation & MT: Machine Translation
 - **State-of-the-art performance** in several ST benchmarks (comparable/superior to cascaded ASR and MT)
 - Transformer based end-to-end ST (new!)
@@ -91,6 +151,11 @@
 - Transformer and Tacotron2 based parallel VC using melspectrogram (new!)
 - End-to-end VC based on cascaded ASR+TTS (Baseline system for Voice Conversion Challenge 2020!)
 
+### SLU: Spoken Language Understanding
+- Predicting intent either by directly classifying it as one of the intents or by decoding the intent character by character
+- Transformer & RNN based encoder-decoder model
+- Establishes SOTA results with spectral augmentation (performs better than the reported results of pretrained models on the Fluent Speech Commands dataset)
+
 ### DNN Framework
 - Flexible network architecture thanks to chainer and pytorch
 - Flexible front-end processing thanks to [kaldiio](https://github.com/nttcslab-sp/kaldiio) and HDF5 support
@@ -99,13 +164,13 @@
 
 ### ESPnet2
 See [ESPnet2](https://espnet.github.io/espnet/espnet2_tutorial.html).
-- Indepedent from Kaldi/Chainer, unlike ESPnet1
+- Independent from Kaldi/Chainer, unlike ESPnet1
 - On the fly feature extraction and text processing when training
 - Supporting both DistributedDataParallel and DataParallel
 - Supporting multi-node training and integrated with [Slurm](https://slurm.schedmd.com/) or MPI
 - Supporting Sharded Training provided by [fairscale](https://github.com/facebookresearch/fairscale)
 - A template recipe which can be applied for all corpora
-- Possible to train any size of corpus without cpu memory error
+- Possible to train any size of corpus without CPU memory error
 - [ESPnet Model Zoo](https://github.com/espnet/espnet_model_zoo)
 - Integrated with [wandb](https://espnet.github.io/espnet/espnet2_training_option.html#weights-biases-integration)
@@ -113,21 +178,23 @@
 - If you intend to do full experiments including DNN training, then see [Installation](https://espnet.github.io/espnet/installation.html).
 - If you just need the Python module only:
     ```sh
+    # We recommend installing pytorch before espnet, following https://pytorch.org/get-started/locally/
    pip install espnet
    # To install latest
    # pip install git+https://github.com/espnet/espnet
+    # To install additional packages
+    # pip install "espnet[all]"
    ```
 
-    You need to install some packages.
+    If you'll use ESPnet1, please install chainer and cupy.
 
    ```sh
-    pip install torch
-    pip install chainer==6.0.0 cupy==6.0.0    # [Option] If you'll use ESPnet1
-    pip install torchaudio    # [Option] If you'll use enhancement task
-    pip install torch_optimizer    # [Option] If you'll use additional optimizers in ESPnet2
+    pip install chainer==6.0.0 cupy==6.0.0    # [Option]
    ```
 
-    There are some required packages depending on each task other than above. If you meet ImportError, please intall them at that time.
+    You might need to install some packages depending on the task. We prepared various installation scripts at [tools/installers](tools/installers).
+
+- (ESPnet2) Once installed, run `wandb login` and set `--use_wandb true` to enable tracking runs using W&B.
 
 ## Usage
 See [Usage](https://espnet.github.io/espnet/tutorial.html).
@@ -137,7 +204,7 @@
 go to [docker/](docker/) and follow [instructions](https://espnet.github.io/espnet/docker.html).
 
 ## Contribution
-Thank you for taking times for ESPnet! Any contributions to ESPNet are welcome and feel free to ask any questions or requests to [issues](https://github.com/espnet/espnet/issues).
+Thank you for taking the time to contribute to ESPnet! Any contributions to ESPnet are welcome, and feel free to ask questions or make requests via [issues](https://github.com/espnet/espnet/issues).
 If it's the first contribution to ESPnet for you, please follow the [contribution guide](CONTRIBUTING.md).
 
 ## Results and demo
@@ -151,20 +218,22 @@ You can find useful tutorials and demos in [Interspeech 2019 Tutorial](https://g
 
 We list the character error rate (CER) and word error rate (WER) of major ASR tasks.
-| Task | CER (%) | WER (%) | Pretrained model| -| ----------- | :----: | :----: | :----: | -| Aishell dev/test | 4.6/5.1 | N/A | [link](https://github.com/espnet/espnet/blob/master/egs/aishell/asr1/RESULTS.md#conformer-kernel-size--15--specaugment--lm-weight--00-result) | -| **ESPnet2** Aishell dev/test | 4.4/4.7 | N/A | [link](https://github.com/espnet/espnet/tree/master/egs2/aishell/asr1#conformer--specaug--speed-perturbation-featsraw-n_fft512-hop_length128) | -| Common Voice dev/test | 1.7/1.8 | 2.2/2.3 | [link](https://github.com/espnet/espnet/blob/master/egs/commonvoice/asr1/RESULTS.md#first-results-default-pytorch-transformer-setting-with-bpe-100-epochs-single-gpu) | -| CSJ eval1/eval2/eval3 | 5.7/3.8/4.2 | N/A | [link](https://github.com/espnet/espnet/blob/master/egs/csj/asr1/RESULTS.md#pytorch-backend-transformer-without-any-hyperparameter-tuning) | -| **ESPnet2** CSJ eval1/eval2/eval3 | 4.5/3.3/3.6 | N/A | [link](https://github.com/espnet/espnet/tree/master/egs2/csj/asr1#initial-conformer-results) | -| HKUST dev | 23.5 | N/A | [link](https://github.com/espnet/espnet/blob/master/egs/hkust/asr1/RESULTS.md#transformer-only-20-epochs) | -| Librispeech dev_clean/dev_other/test_clean/test_other | N/A | 1.9/4.9/2.1/4.9 | [link](https://github.com/espnet/espnet/blob/master/egs/librispeech/asr1/RESULTS.md#pytorch-large-conformer-with-specaug--speed-perturbation-8-gpus--transformer-lm-4-gpus) | -| Switchboard (eval2000) callhm/swbd | N/A | 14.0/6.8 | [link](https://github.com/espnet/espnet/blob/master/egs/swbd/asr1/RESULTS.md#conformer-with-bpe-2000-specaug-speed-perturbation-transformer-lm-decoding) | -| TEDLIUM2 dev/test | N/A | 8.6/7.2 | [link](https://github.com/espnet/espnet/blob/master/egs/tedlium2/asr1/RESULTS.md#conformer-large-model--specaug--speed-perturbation--rnnlm) | -| TEDLIUM3 dev/test | N/A | 9.6/7.6 | [link](https://github.com/espnet/espnet/blob/master/egs/tedlium3/asr1/RESULTS.md) | -| WSJ dev93/eval92 | 3.2/2.1 | 7.0/4.7 | N/A | -| **ESPnet2** WSJ dev93/eval92 | 2.7/1.8 | 6.6/4.6 | [link](https://github.com/espnet/espnet/tree/master/egs2/wsj/asr1#using-transformer-lm-asr-model-is-same-as-the-above-lm_weight12-ctc_weight03-beam_size20) | +| Task | CER (%) | WER (%) | Pretrained model | +| ----------------------------------------------------------------- | :-------------: | :-------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Aishell dev/test | 4.6/5.1 | N/A | [link](https://github.com/espnet/espnet/blob/master/egs/aishell/asr1/RESULTS.md#conformer-kernel-size--15--specaugment--lm-weight--00-result) | +| **ESPnet2** Aishell dev/test | 4.4/4.7 | N/A | [link](https://github.com/espnet/espnet/tree/master/egs2/aishell/asr1#conformer--specaug--speed-perturbation-featsraw-n_fft512-hop_length128) | +| Common Voice dev/test | 1.7/1.8 | 2.2/2.3 | [link](https://github.com/espnet/espnet/blob/master/egs/commonvoice/asr1/RESULTS.md#first-results-default-pytorch-transformer-setting-with-bpe-100-epochs-single-gpu) | +| CSJ eval1/eval2/eval3 | 5.7/3.8/4.2 | N/A | [link](https://github.com/espnet/espnet/blob/master/egs/csj/asr1/RESULTS.md#pytorch-backend-transformer-without-any-hyperparameter-tuning) | +| **ESPnet2** CSJ eval1/eval2/eval3 | 4.5/3.3/3.6 | N/A | [link](https://github.com/espnet/espnet/tree/master/egs2/csj/asr1#initial-conformer-results) | +| HKUST dev | 23.5 | N/A | 
[link](https://github.com/espnet/espnet/blob/master/egs/hkust/asr1/RESULTS.md#transformer-only-20-epochs) | +| **ESPnet2** HKUST dev | 21.2 | N/A | [link](https://github.com/espnet/espnet/tree/master/egs2/hkust/asr1#transformer-asr--transformer-lm) | +| Librispeech dev_clean/dev_other/test_clean/test_other | N/A | 1.9/4.9/2.1/4.9 | [link](https://github.com/espnet/espnet/blob/master/egs/librispeech/asr1/RESULTS.md#pytorch-large-conformer-with-specaug--speed-perturbation-8-gpus--transformer-lm-4-gpus) | +| **ESPnet2** Librispeech dev_clean/dev_other/test_clean/test_other | 0.6/1.5/0.6/1.4 | 1.7/3.4/1.8/3.6 | [link](https://github.com/espnet/espnet/tree/master/egs2/librispeech/asr1#self-supervised-learning-features-hubert_large_ll60k-conformer-utt_mvn-with-transformer-lm) | +| Switchboard (eval2000) callhm/swbd | N/A | 14.0/6.8 | [link](https://github.com/espnet/espnet/blob/master/egs/swbd/asr1/RESULTS.md#conformer-with-bpe-2000-specaug-speed-perturbation-transformer-lm-decoding) | +| TEDLIUM2 dev/test | N/A | 8.6/7.2 | [link](https://github.com/espnet/espnet/blob/master/egs/tedlium2/asr1/RESULTS.md#conformer-large-model--specaug--speed-perturbation--rnnlm) | +| TEDLIUM3 dev/test | N/A | 9.6/7.6 | [link](https://github.com/espnet/espnet/blob/master/egs/tedlium3/asr1/RESULTS.md) | +| WSJ dev93/eval92 | 3.2/2.1 | 7.0/4.7 | N/A | +| **ESPnet2** WSJ dev93/eval92 | 1.1/0.8 | 2.8/1.8 | [link](https://github.com/espnet/espnet/tree/master/egs2/wsj/asr1#self-supervised-learning-features-wav2vec2_large_ll60k-conformer-utt_mvn-with-transformer-lm) | Note that the performance of the CSJ, HKUST, and Librispeech tasks was significantly improved by using the wide network (#units = 1024) and large subword units if necessary reported by [RWTH](https://arxiv.org/pdf/1805.03294.pdf). @@ -191,7 +260,7 @@ The sampling rate must be consistent with that of data used in training. Available pretrained models in the demo script are listed as below. | Model | Notes | -| :------ | :------ | +| :----------------------------------------------------------------------------------------------- | :--------------------------------------------------------- | | [tedlium2.rnn.v1](https://drive.google.com/open?id=1UqIY6WJMZ4sxNxSugUqp3mrGb3j6h7xe) | Streaming decoding based on CTC-based VAD | | [tedlium2.rnn.v2](https://drive.google.com/open?id=1cac5Uc09lJrCYfWkLQsF8eapQcxZnYdf) | Streaming decoding based on CTC-based VAD (batch decoding) | | [tedlium2.transformer.v1](https://drive.google.com/open?id=1cVeSOYY1twOfL9Gns7Z3ZDnkrJqNwPow) | Joint-CTC attention Transformer trained on Tedlium 2 | @@ -203,6 +272,30 @@ Available pretrained models in the demo script are listed as below. +### SE results +
expand
+
+We list results from three different models on WSJ0-2mix, which is one of the most widely used benchmark datasets for speech separation.
+
+| Model                                             | STOI | SAR   | SDR   | SIR   |
+| ------------------------------------------------- | ---- | ----- | ----- | ----- |
+| [TF Masking](https://zenodo.org/record/4498554)   | 0.89 | 11.40 | 10.24 | 18.04 |
+| [Conv-Tasnet](https://zenodo.org/record/4498562)  | 0.95 | 16.62 | 15.94 | 25.90 |
+| [DPRNN-Tasnet](https://zenodo.org/record/4688000) | 0.96 | 18.82 | 18.29 | 28.92 |
+
+ +### SE demos +
expand
+You can try the interactive demo with Google Colab. Please click the following button to get access to the demos. + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1fjRJCh96SoYLZPRxsjF9VDv4Q2VoIckI?usp=sharing) + + +It is based on ESPnet2. Pretrained models are available for both speech enhancement and speech separation tasks. + +
+ ### ST results
expand
@@ -210,23 +303,23 @@ Available pretrained models in the demo script are listed as below. We list 4-gram BLEU of major ST tasks. #### end-to-end system -| Task | BLEU | Pretrained model | -| ---- | :----: | :----: | +| Task | BLEU | Pretrained model | +| ------------------------------------------------- | :---: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | | Fisher-CallHome Spanish fisher_test (Es->En) | 51.03 | [link](https://github.com/espnet/espnet/blob/master/egs/fisher_callhome_spanish/st1/RESULTS.md#train_spen_lcrm_pytorch_train_pytorch_transformer_bpe_short_long_bpe1000_specaug_asrtrans_mttrans) | | Fisher-CallHome Spanish callhome_evltest (Es->En) | 20.44 | [link](https://github.com/espnet/espnet/blob/master/egs/fisher_callhome_spanish/st1/RESULTS.md#train_spen_lcrm_pytorch_train_pytorch_transformer_bpe_short_long_bpe1000_specaug_asrtrans_mttrans) | -| Libri-trans test (En->Fr) | 16.70 | [link](https://github.com/espnet/espnet/blob/master/egs/libri_trans/st1/RESULTS.md#train_spfr_lc_pytorch_train_pytorch_transformer_bpe_short_long_bpe1000_specaug_asrtrans_mttrans-1) | -| How2 dev5 (En->Pt) | 45.68 | [link](https://github.com/espnet/espnet/blob/master/egs/how2/st1/RESULTS.md#trainpt_tc_pytorch_train_pytorch_transformer_short_long_bpe8000_specaug_asrtrans_mttrans-1) | -| Must-C tst-COMMON (En->De) | 22.91 | [link](https://github.com/espnet/espnet/blob/master/egs/must_c/st1/RESULTS.md#train_spen-dede_tc_pytorch_train_pytorch_transformer_short_long_bpe8000_specaug_asrtrans_mttrans) | -| Mboshi-French dev (Fr->Mboshi) | 6.18 | N/A | +| Libri-trans test (En->Fr) | 16.70 | [link](https://github.com/espnet/espnet/blob/master/egs/libri_trans/st1/RESULTS.md#train_spfr_lc_pytorch_train_pytorch_transformer_bpe_short_long_bpe1000_specaug_asrtrans_mttrans-1) | +| How2 dev5 (En->Pt) | 45.68 | [link](https://github.com/espnet/espnet/blob/master/egs/how2/st1/RESULTS.md#trainpt_tc_pytorch_train_pytorch_transformer_short_long_bpe8000_specaug_asrtrans_mttrans-1) | +| Must-C tst-COMMON (En->De) | 22.91 | [link](https://github.com/espnet/espnet/blob/master/egs/must_c/st1/RESULTS.md#train_spen-dede_tc_pytorch_train_pytorch_transformer_short_long_bpe8000_specaug_asrtrans_mttrans) | +| Mboshi-French dev (Fr->Mboshi) | 6.18 | N/A | #### cascaded system -| Task | BLEU | Pretrained model | -| ---- | :----: | :----: | -| Fisher-CallHome Spanish fisher_test (Es->En) | 42.16 | N/A | -| Fisher-CallHome Spanish callhome_evltest (Es->En) | 19.82 | N/A | -| Libri-trans test (En->Fr) | 16.96 | N/A | -| How2 dev5 (En->Pt) | 44.90 | N/A | -| Must-C tst-COMMON (En->De) | 23.65 | N/A | +| Task | BLEU | Pretrained model | +| ------------------------------------------------- | :---: | :--------------: | +| Fisher-CallHome Spanish fisher_test (Es->En) | 42.16 | N/A | +| Fisher-CallHome Spanish callhome_evltest (Es->En) | 19.82 | N/A | +| Libri-trans test (En->Fr) | 16.96 | N/A | +| How2 dev5 (En->Pt) | 44.90 | N/A | +| Must-C tst-COMMON (En->De) | 23.65 | N/A | If you want to check the results of the other recipes, please check `egs//st1/RESULTS.md`. @@ -259,9 +352,9 @@ The sampling rate must be consistent with that of data used in training. Available pretrained models in the demo script are listed as below. 
-| Model | Notes | -| :------ | :------ | -| [fisher_callhome_spanish.transformer.v1](https://drive.google.com/open?id=1hawp5ZLw4_SIHIT3edglxbKIIkPVe8n3) | Transformer-ST trained on Fisher-CallHome Spanish Es->En | +| Model | Notes | +| :----------------------------------------------------------------------------------------------------------- | :------------------------------------------------------- | +| [fisher_callhome_spanish.transformer.v1](https://drive.google.com/open?id=1hawp5ZLw4_SIHIT3edglxbKIIkPVe8n3) | Transformer-ST trained on Fisher-CallHome Spanish Es->En |
@@ -270,17 +363,18 @@ Available pretrained models in the demo script are listed as below.
<details><summary>expand</summary>
-| Task | BLEU | Pretrained model | -| ---- | :----: | :----: | +| Task | BLEU | Pretrained model | +| ------------------------------------------------- | :---: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------: | | Fisher-CallHome Spanish fisher_test (Es->En) | 61.45 | [link](https://github.com/espnet/espnet/blob/master/egs/fisher_callhome_spanish/mt1/RESULTS.md#trainen_lcrm_lcrm_pytorch_train_pytorch_transformer_bpe_bpe1000) | | Fisher-CallHome Spanish callhome_evltest (Es->En) | 29.86 | [link](https://github.com/espnet/espnet/blob/master/egs/fisher_callhome_spanish/mt1/RESULTS.md#trainen_lcrm_lcrm_pytorch_train_pytorch_transformer_bpe_bpe1000) | -| Libri-trans test (En->Fr) | 18.09 | [link](https://github.com/espnet/espnet/blob/master/egs/libri_trans/mt1/RESULTS.md#trainfr_lcrm_tc_pytorch_train_pytorch_transformer_bpe1000) | -| How2 dev5 (En->Pt) | 58.61 | [link](https://github.com/espnet/espnet/blob/master/egs/how2/mt1/RESULTS.md#trainpt_tc_tc_pytorch_train_pytorch_transformer_bpe8000) | -| Must-C tst-COMMON (En->De) | 27.63 | [link](https://github.com/espnet/espnet/blob/master/egs/must_c/mt1/RESULTS.md#summary-4-gram-bleu) | -| IWSLT'14 test2014 (En->De) | 24.70 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) | -| IWSLT'14 test2014 (De->En) | 29.22 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) | -| IWSLT'16 test2014 (En->De) | 24.05 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) | -| IWSLT'16 test2014 (De->En) | 29.13 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) | +| Libri-trans test (En->Fr) | 18.09 | [link](https://github.com/espnet/espnet/blob/master/egs/libri_trans/mt1/RESULTS.md#trainfr_lcrm_tc_pytorch_train_pytorch_transformer_bpe1000) | +| How2 dev5 (En->Pt) | 58.61 | [link](https://github.com/espnet/espnet/blob/master/egs/how2/mt1/RESULTS.md#trainpt_tc_tc_pytorch_train_pytorch_transformer_bpe8000) | +| Must-C tst-COMMON (En->De) | 27.63 | [link](https://github.com/espnet/espnet/blob/master/egs/must_c/mt1/RESULTS.md#summary-4-gram-bleu) | +| IWSLT'14 test2014 (En->De) | 24.70 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) | +| IWSLT'14 test2014 (De->En) | 29.22 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) | +| IWSLT'14 test2014 (De->En) | 32.2 | [link](https://github.com/espnet/espnet/blob/master/egs2/iwslt14/mt1/README.md) | +| IWSLT'16 test2014 (En->De) | 24.05 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) | +| IWSLT'16 test2014 (De->En) | 29.13 | [link](https://github.com/espnet/espnet/blob/master/egs/iwslt16/mt1/RESULTS.md#result) |
@@ -288,7 +382,7 @@ Available pretrained models in the demo script are listed as below.
<details><summary>ESPnet2</summary>
-You can listen to the generated samples in the following url. +You can listen to the generated samples in the following URL. - [ESPnet2 TTS generated samples](https://drive.google.com/drive/folders/1H3fnlBbWMEkQUfrHqosKN_ZX_WjO29ma?usp=sharing) > Note that in the generation we use Griffin-Lim (`wav/`) and [Parallel WaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN) (`wav_pwg/`). @@ -340,19 +434,19 @@ If you want to build your own neural vocoder, please check the above repositorie Here we list all of the pretrained neural vocoders. Please download and enjoy the generation of high quality speech! | Model link | Lang | Fs [Hz] | Mel range [Hz] | FFT / Shift / Win [pt] | Model type | -| :------ | :---: | :----: | :--------: | :---------------: | :------ | -| [ljspeech.wavenet.softmax.ns.v1](https://drive.google.com/open?id=1eA1VcRS9jzFa-DovyTgJLQ_jmwOLIi8L) | EN | 22.05k | None | 1024 / 256 / None | [Softmax WaveNet](https://github.com/kan-bayashi/PytorchWaveNetVocoder) | -| [ljspeech.wavenet.mol.v1](https://drive.google.com/open?id=1sY7gEUg39QaO1szuN62-Llst9TrFno2t) | EN | 22.05k | None | 1024 / 256 / None | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | -| [ljspeech.parallel_wavegan.v1](https://drive.google.com/open?id=1tv9GKyRT4CDsvUWKwH3s_OfXkiTi0gw7) | EN | 22.05k | None | 1024 / 256 / None | [Parallel WaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | -| [ljspeech.wavenet.mol.v2](https://drive.google.com/open?id=1es2HuKUeKVtEdq6YDtAsLNpqCy4fhIXr) | EN | 22.05k | 80-7600 | 1024 / 256 / None | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | -| [ljspeech.parallel_wavegan.v2](https://drive.google.com/open?id=1Grn7X9wD35UcDJ5F7chwdTqTa4U7DeVB) | EN | 22.05k | 80-7600 | 1024 / 256 / None | [Parallel WaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | -| [ljspeech.melgan.v1](https://drive.google.com/open?id=1ipPWYl8FBNRlBFaKj1-i23eQpW_W_YcR) | EN | 22.05k | 80-7600 | 1024 / 256 / None | [MelGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | -| [ljspeech.melgan.v3](https://drive.google.com/open?id=1_a8faVA5OGCzIcJNw4blQYjfG4oA9VEt) | EN | 22.05k | 80-7600 | 1024 / 256 / None | [MelGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | -| [libritts.wavenet.mol.v1](https://drive.google.com/open?id=1jHUUmQFjWiQGyDd7ZeiCThSjjpbF_B4h) | EN | 24k | None | 1024 / 256 / None | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | -| [jsut.wavenet.mol.v1](https://drive.google.com/open?id=187xvyNbmJVZ0EZ1XHCdyjZHTXK9EcfkK) | JP | 24k | 80-7600 | 2048 / 300 / 1200 | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | -| [jsut.parallel_wavegan.v1](https://drive.google.com/open?id=1OwrUQzAmvjj1x9cDhnZPp6dqtsEqGEJM) | JP | 24k | 80-7600 | 2048 / 300 / 1200 | [Parallel WaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | -| [csmsc.wavenet.mol.v1](https://drive.google.com/open?id=1PsjFRV5eUP0HHwBaRYya9smKy5ghXKzj) | ZH | 24k | 80-7600 | 2048 / 300 / 1200 | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | -| [csmsc.parallel_wavegan.v1](https://drive.google.com/open?id=10M6H88jEUGbRWBmU1Ff2VaTmOAeL8CEy) | ZH | 24k | 80-7600 | 2048 / 300 / 1200 | [Parallel WaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | +| :--------------------------------------------------------------------------------------------------- | :---: | :-----: | :------------: | :--------------------: | :---------------------------------------------------------------------- | +| 
[ljspeech.wavenet.softmax.ns.v1](https://drive.google.com/open?id=1eA1VcRS9jzFa-DovyTgJLQ_jmwOLIi8L) | EN | 22.05k | None | 1024 / 256 / None | [Softmax WaveNet](https://github.com/kan-bayashi/PytorchWaveNetVocoder) | +| [ljspeech.wavenet.mol.v1](https://drive.google.com/open?id=1sY7gEUg39QaO1szuN62-Llst9TrFno2t) | EN | 22.05k | None | 1024 / 256 / None | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | +| [ljspeech.parallel_wavegan.v1](https://drive.google.com/open?id=1tv9GKyRT4CDsvUWKwH3s_OfXkiTi0gw7) | EN | 22.05k | None | 1024 / 256 / None | [Parallel WaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | +| [ljspeech.wavenet.mol.v2](https://drive.google.com/open?id=1es2HuKUeKVtEdq6YDtAsLNpqCy4fhIXr) | EN | 22.05k | 80-7600 | 1024 / 256 / None | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | +| [ljspeech.parallel_wavegan.v2](https://drive.google.com/open?id=1Grn7X9wD35UcDJ5F7chwdTqTa4U7DeVB) | EN | 22.05k | 80-7600 | 1024 / 256 / None | [Parallel WaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | +| [ljspeech.melgan.v1](https://drive.google.com/open?id=1ipPWYl8FBNRlBFaKj1-i23eQpW_W_YcR) | EN | 22.05k | 80-7600 | 1024 / 256 / None | [MelGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | +| [ljspeech.melgan.v3](https://drive.google.com/open?id=1_a8faVA5OGCzIcJNw4blQYjfG4oA9VEt) | EN | 22.05k | 80-7600 | 1024 / 256 / None | [MelGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | +| [libritts.wavenet.mol.v1](https://drive.google.com/open?id=1jHUUmQFjWiQGyDd7ZeiCThSjjpbF_B4h) | EN | 24k | None | 1024 / 256 / None | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | +| [jsut.wavenet.mol.v1](https://drive.google.com/open?id=187xvyNbmJVZ0EZ1XHCdyjZHTXK9EcfkK) | JP | 24k | 80-7600 | 2048 / 300 / 1200 | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | +| [jsut.parallel_wavegan.v1](https://drive.google.com/open?id=1OwrUQzAmvjj1x9cDhnZPp6dqtsEqGEJM) | JP | 24k | 80-7600 | 2048 / 300 / 1200 | [Parallel WaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | +| [csmsc.wavenet.mol.v1](https://drive.google.com/open?id=1PsjFRV5eUP0HHwBaRYya9smKy5ghXKzj) | ZH | 24k | 80-7600 | 2048 / 300 / 1200 | [MoL WaveNet](https://github.com/r9y9/wavenet_vocoder) | +| [csmsc.parallel_wavegan.v1](https://drive.google.com/open?id=10M6H88jEUGbRWBmU1Ff2VaTmOAeL8CEy) | ZH | 24k | 80-7600 | 2048 / 300 / 1200 | [Parallel WaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN) | If you want to use the above pretrained vocoders, please exactly match the feature setting with them. @@ -393,7 +487,7 @@ synth_wav.sh example.txt # also you can use multiple sentences echo "THIS IS A DEMONSTRATION OF TEXT TO SPEECH." > example_multi.txt -echo "TEXT TO SPEECH IS A TECHQNIQUE TO CONVERT TEXT INTO SPEECH." >> example_multi.txt +echo "TEXT TO SPEECH IS A TECHNIQUE TO CONVERT TEXT INTO SPEECH." >> example_multi.txt synth_wav.sh example_multi.txt ``` @@ -434,16 +528,27 @@ The [Voice Conversion Challenge 2020](http://www.vc-challenge.org/) (VCC2020) ad In VCC2020, the objective is intra/cross lingual nonparallel VC. You can download converted samples of the cascade ASR+TTS baseline system [here](https://drive.google.com/drive/folders/1oeZo83GrOgtqxGwF7KagzIrfjr8X59Ue?usp=sharing). +
+ +### SLU results + +
+<details><summary>ESPnet2</summary>
+
+- Transformer-based SLU for the Fluent Speech Commands dataset
+
+In SLU, the objective is to infer the meaning or intent of a spoken utterance. The [Fluent Speech Commands dataset](https://fluent.ai/fluent-speech-commands-a-dataset-for-spoken-language-understanding-research/) describes an intent as a combination of three slot values: action, object, and location. You can see the baseline results on this dataset [here](https://github.com/espnet/espnet/blob/master/egs2/fsc/asr1/RESULTS.md).
+
+
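+As a rough illustration of this slot structure (a sketch only; the label strings below are illustrative, not the dataset's exact inventory):
+
+```python
+# Hypothetical FSC-style example: an utterance maps to three slots
+# (action, object, location), and their combination is the intent.
+utterance = "Turn on the lights in the kitchen"
+intent = {"action": "activate", "object": "lights", "location": "kitchen"}
+# The full intent label combines the three slot values:
+print("{action}_{object}_{location}".format(**intent))  # activate_lights_kitchen
+```
+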
### CTC Segmentation demo

-<details><summary>expand</summary>
+
+<details><summary>ESPnet1</summary>
[CTC segmentation](https://arxiv.org/abs/2007.09127) determines utterance segments within audio files.
Aligned utterance segments constitute the labels of speech datasets.

-As demo, we align start and end of utterances within the audio file `ctc_align_test.wav`, using the example script `utils/ctc_align_wav.sh`.
+As a demo, we align the start and end of utterances within the audio file `ctc_align_test.wav`, using the example script `utils/asr_align_wav.sh`.

For preparation, set up a data directory:

```sh
@@ -497,14 +602,95 @@ A full example recipe is in `egs/tedlium2/align1/`.
+
+<details><summary>ESPnet2</summary>
+
+[CTC segmentation](https://arxiv.org/abs/2007.09127) determines utterance segments within audio files.
+Aligned utterance segments constitute the labels of speech datasets.
+
+As a demo, we align the start and end of utterances within the audio file `ctc_align_test.wav`.
+This can be done either directly from the Python command line or using the script `espnet2/bin/asr_align.py`.
+
+From the Python command line interface:
+
+```python
+# load a model with character tokens
+from espnet_model_zoo.downloader import ModelDownloader
+d = ModelDownloader(cachedir="./modelcache")
+wsjmodel = d.download_and_unpack("kamo-naoyuki/wsj")
+# load the example file included in the ESPnet repository
+import soundfile
+speech, rate = soundfile.read("./test_utils/ctc_align_test.wav")
+# CTC segmentation
+from espnet2.bin.asr_align import CTCSegmentation
+aligner = CTCSegmentation(**wsjmodel, fs=rate)
+text = """
+utt1 THE SALE OF THE HOTELS
+utt2 IS PART OF HOLIDAY'S STRATEGY
+utt3 TO SELL OFF ASSETS
+utt4 AND CONCENTRATE ON PROPERTY MANAGEMENT
+"""
+segments = aligner(speech, text)
+print(segments)
+# utt1 utt 0.26 1.73 -0.0154 THE SALE OF THE HOTELS
+# utt2 utt 1.73 3.19 -0.7674 IS PART OF HOLIDAY'S STRATEGY
+# utt3 utt 3.19 4.20 -0.7433 TO SELL OFF ASSETS
+# utt4 utt 4.20 6.10 -0.4899 AND CONCENTRATE ON PROPERTY MANAGEMENT
+```
+
+Aligning also works with fragments of the text.
+For this, set the `gratis_blank` option, which allows skipping unrelated audio sections without penalty.
+It's also possible to omit the utterance names at the beginning of each line by setting `kaldi_style_text` to False.
+
+```python
+aligner.set_config(gratis_blank=True, kaldi_style_text=False)
+text = ["SALE OF THE HOTELS", "PROPERTY MANAGEMENT"]
+segments = aligner(speech, text)
+print(segments)
+# utt_0000 utt 0.37 1.72 -2.0651 SALE OF THE HOTELS
+# utt_0001 utt 4.70 6.10 -5.0566 PROPERTY MANAGEMENT
+```
+
+The script `espnet2/bin/asr_align.py` uses a similar interface. To align utterances:
-## References
+```sh
+# ASR model and config files from pretrained model (e.g. from cachedir):
+asr_config=<path-to-model>/config.yaml
+asr_model=<path-to-model>/valid.*best.pth
+# prepare the text file
+wav="test_utils/ctc_align_test.wav"
+text="test_utils/ctc_align_text.txt"
+cat << EOF > ${text}
+utt1 THE SALE OF THE HOTELS
+utt2 IS PART OF HOLIDAY'S STRATEGY
+utt3 TO SELL OFF ASSETS
+utt4 AND CONCENTRATE
+utt5 ON PROPERTY MANAGEMENT
+EOF
+# obtain alignments:
+python espnet2/bin/asr_align.py --asr_train_config ${asr_config} --asr_model_file ${asr_model} --audio ${wav} --text ${text}
+# utt1 ctc_align_test 0.26 1.73 -0.0154 THE SALE OF THE HOTELS
+# utt2 ctc_align_test 1.73 3.19 -0.7674 IS PART OF HOLIDAY'S STRATEGY
+# utt3 ctc_align_test 3.19 4.20 -0.7433 TO SELL OFF ASSETS
+# utt4 ctc_align_test 4.20 4.97 -0.6017 AND CONCENTRATE
+# utt5 ctc_align_test 4.97 6.10 -0.3477 ON PROPERTY MANAGEMENT
+```
+
+The output of the script can be redirected to a `segments` file by adding the argument `--output segments`.
+Each line contains the file/utterance name, the utterance start and end times in seconds, and a confidence score; optionally also the utterance text.
+The confidence score is a probability in log space that indicates how well the utterance was aligned. If needed, remove bad utterances:
-[1] Shinji Watanabe, Takaaki Hori, Shigeki Karita, Tomoki Hayashi, Jiro Nishitoba, Yuya Unno, Nelson Enrique Yalta Soplin, Jahn Heymann, Matthew Wiesner, Nanxin Chen, Adithya Renduchintala, and Tsubasa Ochiai, "ESPnet: End-to-End Speech Processing Toolkit," *Proc. Interspeech'18*, pp. 2207-2211 (2018)
+```sh
+min_confidence_score=-7
+# here, we assume that the output was written to the file `segments`
+awk -v ms=${min_confidence_score} '{ if ($5 > ms) {print} }' segments
+```
-[2] Suyoun Kim, Takaaki Hori, and Shinji Watanabe, "Joint CTC-attention based end-to-end speech recognition using multi-task learning," *Proc. ICASSP'17*, pp. 4835--4839 (2017)
+See the module documentation for more information.
+It is recommended to use models with RNN-based encoders (such as BLSTMP) for aligning large audio files,
+rather than Transformer models, which have a high memory consumption on longer audio data.
+The sample rate of the audio must be consistent with that of the data used in training; adjust with `sox` if needed.
-[3] Shinji Watanabe, Takaaki Hori, Suyoun Kim, John R. Hershey and Tomoki Hayashi, "Hybrid CTC/Attention Architecture for End-to-End Speech Recognition," *IEEE Journal of Selected Topics in Signal Processing*, vol. 11, no. 8, pp. 1240-1253, Dec. 2017
+
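+The same confidence filtering can also be done in Python; a minimal sketch, assuming the five-column `segments` format described above (any remaining columns hold the utterance text):
+
+```python
+# Keep only segments whose confidence score (the 5th column) exceeds a threshold.
+min_confidence_score = -7.0
+with open("segments") as f:
+    for line in f:
+        columns = line.split()
+        # columns: utterance name, file name, start, end, score[, text ...]
+        if float(columns[4]) > min_confidence_score:
+            print(line.rstrip())
+```
+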
## Citations

@@ -551,4 +737,10 @@ A full example recipe is in `egs/tedlium2/align1/`.
   year={2021},
   organization={IEEE},
 }
+@article{arora2021espnet,
+  title={ESPnet-SLU: Advancing Spoken Language Understanding through ESPnet},
+  author={Arora, Siddhant and Dalmia, Siddharth and Denisov, Pavel and Chang, Xuankai and Ueda, Yushi and Peng, Yifan and Zhang, Yuekai and Kumar, Sujay and Ganesan, Karthik and Yan, Brian and others},
+  journal={arXiv preprint arXiv:2111.14706},
+  year={2021}
+}
 ```
diff --git a/ci/doc.sh b/ci/doc.sh
index d1f36bdfd13..114bc92b952 100755
--- a/ci/doc.sh
+++ b/ci/doc.sh
@@ -26,11 +26,13 @@ set -euo pipefail
 find ./utils/{*.sh,spm_*} -exec ./doc/usage2rst.sh {} \; | tee ./doc/_gen/utils_sh.rst
 find ./espnet2/bin/*.py -exec ./doc/usage2rst.sh {} \; | tee ./doc/_gen/espnet2_bin.rst

+./doc/notebook2rst.sh > ./doc/_gen/notebooks.rst
+
 # generate package doc
 ./doc/module2rst.py --root espnet espnet2 --dst ./doc --exclude espnet.bin

 # build html
-travis-sphinx build --source=doc --nowarn
+# TODO(karita): add -W to turn warnings into errors
+sphinx-build -b html doc doc/build

 touch doc/build/.nojekyll
-
diff --git a/ci/install.sh b/ci/install.sh
index 90d7b92d567..5bfed7584ad 100755
--- a/ci/install.sh
+++ b/ci/install.sh
@@ -14,27 +14,44 @@ ${CXX:-g++} -v
 mkdir -p kaldi/egs/wsj/s5/utils && touch kaldi/egs/wsj/s5/utils/parse_options.sh
 if ${USE_CONDA}; then
     ./setup_anaconda.sh venv espnet ${ESPNET_PYTHON_VERSION}
+    # To install via pip instead of conda
 else
-    ./setup_python.sh "$(command -v python3)" venv
+    ./setup_venv.sh "$(command -v python3)" venv
 fi
 . ./activate_python.sh
 make TH_VERSION="${TH_VERSION}"
-make warp-ctc.done warp-transducer.done chainer_ctc.done nkf.done moses.done mwerSegmenter.done pesq pyopenjtalk.done py3mmseg.done
+make warp-ctc.done warp-transducer.done chainer_ctc.done nkf.done moses.done mwerSegmenter.done pesq pyopenjtalk.done py3mmseg.done s3prl.done transformers.done phonemizer.done fairseq.done k2.done gtn.done longformer.done
 rm -rf kaldi
)
. tools/activate_python.sh
python3 --version

-pip3 install https://github.com/kpu/kenlm/archive/master.zip
+python3 -m pip install https://github.com/kpu/kenlm/archive/master.zip
+# NOTE(kamo): tensorboardx is used for chainer mode only
+python3 -m pip install tensorboardx
+# NOTE(kamo): Create matplotlib.cache to reduce runtime for test phase
+python3 -c "import matplotlib.pyplot"
 # NOTE(kan-bayashi): Fix the error in black installation.
 # See: https://github.com/psf/black/issues/1707
-pip3 uninstall -y typing
+python3 -m pip uninstall -y typing

 # install espnet
-pip3 install -e ".[test]"
-pip3 install -e ".[doc]"
+python3 -m pip install -e ".[test]"
+python3 -m pip install -e ".[doc]"

 # log
-pip3 freeze
+python3 -m pip freeze
+
+
+# Check pytorch version
+python3 << EOF
+import torch
+from distutils.version import LooseVersion as L
+version = L("$TH_VERSION")
+next_version = f"{version.version[0]}.{version.version[1]}.{version.version[2] + 1}"
+if L(torch.__version__) < version or L(torch.__version__) >= L(next_version):
+    raise RuntimeError(f"Pytorch=$TH_VERSION is expected, but got pytorch={torch.__version__}. 
This is a bug in installation scripts") +EOF diff --git a/ci/test_import_all.py b/ci/test_import_all.py new file mode 100755 index 00000000000..e8621bf9340 --- /dev/null +++ b/ci/test_import_all.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +import glob +import importlib +import sys + +try: + import k2 +except Exception: + has_k2 = False +else: + has_k2 = True +try: + import mir_eval +except Exception: + has_mir_eval = False +else: + has_mir_eval = True + + +for dirname in ["espnet", "espnet2"]: + for f in glob.glob(f"{dirname}/**/*.py"): + module_name = f.replace("/", ".")[:-3] + + if ( + ( + not has_k2 + and ( + module_name == "espnet2.bin.asr_inference_k2" + or module_name == "espnet2.fst.lm_rescore" + ) + ) + or (not has_mir_eval and module_name == "espnet2.bin.enh_scoring") + or module_name == "espnet2.tasks.enh_asr" + ): + print(f"[Skip] import {module_name}", file=sys.stderr) + continue + else: + print(f"import {module_name}", file=sys.stderr) + + importlib.import_module(module_name) diff --git a/ci/test_integration.sh b/ci/test_integration.sh deleted file mode 100755 index a1b763489ab..00000000000 --- a/ci/test_integration.sh +++ /dev/null @@ -1,269 +0,0 @@ -#!/usr/bin/env bash - -python="coverage run --append" - -touch .coverage - -# test asr recipe -cwd=$(pwd) -cd ./egs/mini_an4/asr1 || exit 1 -ln -sf ${cwd}/.coverage . -. path.sh # source here to avoid undefined variable errors - -set -euo pipefail - -echo "==== ASR (backend=pytorch lm=RNNLM) ===" -./run.sh --python "${python}" -echo "==== ASR (backend=pytorch, lm=TransformerLM) ===" -./run.sh --python "${python}" --stage 3 --stop-stage 3 --lm-config conf/lm_transformer.yaml --decode-config "$(change_yaml.py conf/decode.yaml -a api=v2)" -# skip duplicated ASR training stage 4 -./run.sh --python "${python}" --stage 5 --lm-config conf/lm_transformer.yaml --decode-config "$(change_yaml.py conf/decode.yaml -a api=v2)" -echo "==== ASR (backend=pytorch, dtype=float64) ===" -./run.sh --python "${python}" --stage 3 --train-config "$(change_yaml.py conf/train.yaml -a train-dtype=float64)" --decode-config "$(change_yaml.py conf/decode.yaml -a api=v2 -a dtype=float64)" -echo "==== ASR (backend=chainer) ===" -./run.sh --python "${python}" --stage 3 --backend chainer - -# skip duplicated ASR training stage 2,3 -# test rnn recipe -echo "=== ASR (backend=pytorch, model=rnn-pure-ctc) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_pure_ctc.yaml \ - --decode-config conf/decode_pure_ctc.yaml -echo "=== ASR (backend=pytorch, model=rnn-no-ctc) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_no_ctc.yaml \ - --decode-config conf/decode_no_ctc.yaml - -# test transformer recipe -echo "=== ASR (backend=pytorch, model=transformer) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_transformer.yaml \ - --decode-config conf/decode.yaml -./run.sh --python "${python}" --stage 5 --train-config conf/train_transformer.yaml \ - --decode-config conf/decode.yaml --metric acc -./run.sh --python "${python}" --stage 5 --train-config conf/train_transformer.yaml \ - --decode-config conf/decode.yaml --metric loss -echo "=== ASR (backend=pytorch, model=conformer) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_conformer.yaml \ - --decode-config conf/decode.yaml -echo "=== ASR (backend=pytorch, model=transformer-pure-ctc) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_transformer_pure_ctc.yaml \ - --decode-config conf/decode_pure_ctc.yaml -echo "=== ASR 
(backend=pytorch, model=conformer-pure-ctc) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_conformer_pure_ctc.yaml \ - --decode-config conf/decode_pure_ctc.yaml -echo "=== ASR (backend=pytorch, model=transformer-no-ctc) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_transformer_no_ctc.yaml \ - --decode-config conf/decode_no_ctc.yaml -echo "=== ASR (backend=pytorch num-encs 2, model=transformer) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_transformer.yaml \ - --decode-config conf/decode.yaml - -# test transducer recipe -echo "=== ASR (backend=pytorch, model=rnnt) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_transducer.yaml \ - --decode-config conf/decode_transducer.yaml -echo "=== ASR (backend=pytorch, model=transformer-transducer) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_transformer_transducer.yaml \ - --decode-config conf/decode_transducer.yaml -echo "=== ASR (backend=pytorch, model=conformer-transducer) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_conformer_transducer.yaml \ - --decode-config conf/decode_transducer.yaml - -# test finetuning -## test transfer learning -echo "=== ASR (backend=pytorch, model=rnnt, transfer_learning=enc) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_transducer_pre_init_enc.yaml \ - --decode-config conf/decode_transducer.yaml -echo "=== ASR (backend=pytorch, model=rnnt, transfer_learning=LM) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_transducer_pre_init_lm.yaml \ - --decode-config conf/decode_transducer.yaml -## to do: cover all tasks + freezing option - -echo "==== ASR (backend=pytorch num-encs 2) ===" -./run.sh --python "${python}" --stage 2 --train-config ./conf/train_mulenc2.yaml --decode-config ./conf/decode_mulenc2.yaml --mulenc true -# Remove generated files in order to reduce the disk usage -rm -rf exp tensorboard dump data -cd ${cwd} || exit 1 - -# test asr_mix recipe -cd ./egs/mini_an4/asr_mix1 || exit 1 -ln -sf ${cwd}/.coverage . - -echo "==== ASR Mix (backend=pytorch, model=rnn) ===" -./run.sh --python "${python}" --train-config conf/train_multispkr.yaml -echo "==== ASR Mix (backend=pytorch, model=transformer) ===" -./run.sh --python "${python}" --stage 4 --train-config conf/train_multispkr_transformer.yaml -# Remove generated files in order to reduce the disk usage -rm -rf exp tensorboard dump data -cd "${cwd}" || exit 1 - -# test st recipe -cd ./egs/mini_an4/st1 || exit 1 -ln -sf ${cwd}/.coverage . 
- -echo "==== ST (backend=pytorch) ===" -./run.sh --python "${python}" -echo "==== ST (backend=pytorch asr0.3) ===" -./run.sh --python "${python}" --stage 4 --train_config conf/train_asr0.3.yaml -echo "==== ST (backend=pytorch ctc asr0.3) ===" -./run.sh --python "${python}" --stage 4 --train_config conf/train_ctc_asr0.3.yaml -echo "==== ST (backend=pytorch mt0.3) ===" -./run.sh --python "${python}" --stage 4 --train_config conf/train_mt0.3.yaml -echo "==== ST (backend=pytorch asr0.2 mt0.2) ===" -./run.sh --python "${python}" --stage 4 --train_config conf/train_asr0.2_mt0.2.yaml -echo "==== ST (backend=pytorch, model=transformer) ===" -./run.sh --python "${python}" --stage 4 --train_config conf/train_transformer.yaml -./run.sh --python "${python}" --stage 5 --train_config conf/train_transformer.yaml \ - --metric acc -./run.sh --python "${python}" --stage 5 --train_config conf/train_transformer.yaml \ - --metric bleu -./run.sh --python "${python}" --stage 5 --train_config conf/train_transformer.yaml \ - --metric loss -echo "==== ST (backend=pytorch asr0.3, model=transformer) ===" -./run.sh --python "${python}" --stage 4 --train_config conf/train_transformer_asr0.3.yaml -echo "==== ST (backend=pytorch ctc asr0.3, model=transformer) ===" -./run.sh --python "${python}" --stage 4 --train_config conf/train_transformer_ctc_asr0.3.yaml -echo "==== ST (backend=pytorch mt0.3, model=transformer) ===" -./run.sh --python "${python}" --stage 4 --train_config conf/train_transformer_mt0.3.yaml -echo "==== ST (backend=pytorch asr0.2 mt0.2, model=transformer) ===" -./run.sh --python "${python}" --stage 4 --train_config conf/train_transformer_asr0.2_mt0.2.yaml -echo "==== ST (backend=pytorch asr0.2 mt0.2, model=conformer) ===" -./run.sh --python "${python}" --stage 4 --train_config conf/train_conformer_asr0.2_mt0.2.yaml -# Remove generated files in order to reduce the disk usage -rm -rf exp tensorboard dump data -cd "${cwd}" || exit 1 - -# test mt recipe -cd ./egs/mini_an4/mt1 || exit 1 -ln -sf ${cwd}/.coverage . - -echo "==== MT (backend=pytorch) ===" -./run.sh --python "${python}" -echo "==== MT (backend=pytorch, model=transformer) ===" -./run.sh --python "${python}" --stage 4 --train_config conf/train_transformer.yaml -./run.sh --python "${python}" --stage 5 --train_config conf/train_transformer.yaml \ - --metric acc -./run.sh --python "${python}" --stage 5 --train_config conf/train_transformer.yaml \ - --metric bleu -./run.sh --python "${python}" --stage 5 --train_config conf/train_transformer.yaml \ - --metric loss -# Remove generated files in order to reduce the disk usage -rm -rf exp tensorboard dump data -cd "${cwd}" || exit 1 - -# test tts recipe -cd ./egs/mini_an4/tts1 || exit 1 -ln -sf ${cwd}/.coverage . - -echo "==== TTS (backend=pytorch) ===" -./run.sh --python "${python}" -# Remove generated files in order to reduce the disk usage -rm -rf exp tensorboard dump data -cd "${cwd}" || exit 1 - -echo "=== run integration tests at test_utils ===" - -PATH=$(pwd)/bats-core/bin:$PATH -if ! [ -x "$(command -v bats)" ]; then - echo "=== install bats ===" - git clone https://github.com/bats-core/bats-core.git -fi -bats test_utils/integration_test_*.bats - - -#### Make sure chainer-independent #### -python3 -m pip uninstall -y chainer - -# [ESPnet2] test asr recipe -cd ./egs2/mini_an4/asr1 || exit 1 -ln -sf ${cwd}/.coverage . 
-echo "==== [ESPnet2] ASR ===" -./run.sh --stage 1 --stop-stage 1 -feats_types="raw fbank_pitch" -token_types="bpe char" -for t in ${feats_types}; do - ./run.sh --stage 2 --stop-stage 4 --feats-type "${t}" --python "${python}" -done -for t in ${token_types}; do - ./run.sh --stage 5 --stop-stage 5 --token-type "${t}" --python "${python}" -done -for t in ${feats_types}; do - for t2 in ${token_types}; do - echo "==== feats_type=${t}, token_types=${t2} ===" - ./run.sh --ngpu 0 --stage 6 --stop-stage 13 --skip-upload false --feats-type "${t}" --token-type "${t2}" \ - --asr-args "--max_epoch=1" --lm-args "--max_epoch=1" --python "${python}" - done -done -# Remove generated files in order to reduce the disk usage -rm -rf exp dump data -cd "${cwd}" || exit 1 - -# [ESPnet2] test tts recipe -cd ./egs2/mini_an4/tts1 || exit 1 -ln -sf ${cwd}/.coverage . -echo "==== [ESPnet2] TTS ===" -./run.sh --stage 1 --stop-stage 1 --python "${python}" -feats_types="raw fbank stft" -for t in ${feats_types}; do - echo "==== feats_type=${t} ===" - ./run.sh --ngpu 0 --stage 2 --stop-stage 8 --skip-upload false --feats-type "${t}" --train-args "--max_epoch 1" --python "${python}" -done -# Remove generated files in order to reduce the disk usage -rm -rf exp dump data -cd "${cwd}" || exit 1 - -# [ESPnet2] test enh recipe -if python -c 'import torch as t; from distutils.version import LooseVersion as L; assert L(t.__version__) >= L("1.2.0")' &> /dev/null; then - cd ./egs2/mini_an4/enh1 || exit 1 - ln -sf ${cwd}/.coverage . - echo "==== [ESPnet2] ENH ===" - ./run.sh --stage 1 --stop-stage 1 --python "${python}" - feats_types="raw" - for t in ${feats_types}; do - echo "==== feats_type=${t} ===" - ./run.sh --ngpu 0 --stage 2 --stop-stage 9 --skip-upload false --feats-type "${t}" --spk-num 1 --enh-args "--max_epoch=1" --python "${python}" - done - # Remove generated files in order to reduce the disk usage - rm -rf exp dump data - cd "${cwd}" || exit 1 -fi - -# [ESPnet2] Validate configuration files -echo "" > dummy_token_list -echo "==== [ESPnet2] Validation configuration files ===" -if python3 -c 'import torch as t; from distutils.version import LooseVersion as L; assert L(t.__version__) >= L("1.6.0")' &> /dev/null; then - for f in egs2/*/asr1/conf/train_asr*.yaml; do - python3 -m espnet2.bin.asr_train --config "${f}" --iterator_type none --dry_run true --output_dir out --token_list dummy_token_list - done - for f in egs2/*/asr1/conf/train_lm*.yaml; do - python3 -m espnet2.bin.lm_train --config "${f}" --iterator_type none --dry_run true --output_dir out --token_list dummy_token_list - done - for f in egs2/*/tts1/conf/train*.yaml; do - python3 -m espnet2.bin.tts_train --config "${f}" --iterator_type none --normalize none --dry_run true --output_dir out --token_list dummy_token_list - done - for f in egs2/*/enh1/conf/train*.yaml; do - python -m espnet2.bin.enh_train --config "${f}" --iterator_type none --dry_run true --output_dir out - done -fi - -# These files must be same each other. 
-for base in cmd.sh conf/slurm.conf conf/queue.conf conf/pbs.conf; do - file1= - for f in egs2/*/*/"${base}"; do - if [ -z "${file1}" ]; then - file1="${f}" - fi - diff "${file1}" "${f}" || { echo "Error: ${file1} and ${f} differ: To solve: for f in egs2/*/*/${base}; do cp egs2/TEMPLATE/asr1/${base} \${f}; done" ; exit 1; } - done -done - - -echo "==== [ESPnet2] test setup.sh ===" -for d in egs2/TEMPLATE/*; do - if [ -d "${d}" ]; then - d="${d##*/}" - egs2/TEMPLATE/"$d"/setup.sh egs2/test/"${d}" - fi -done -echo "=== report ===" - -coverage report -coverage xml diff --git a/ci/test_integration_espnet1.sh b/ci/test_integration_espnet1.sh new file mode 100755 index 00000000000..d88bac14c56 --- /dev/null +++ b/ci/test_integration_espnet1.sh @@ -0,0 +1,173 @@ +#!/usr/bin/env bash + +python="coverage run --append" + +cwd=$(pwd) + +# test asr recipe +cd ./egs/mini_an4/asr1 || exit 1 +. path.sh # source here to avoid undefined variable errors + +set -euo pipefail + +echo "==== ASR (backend=pytorch lm=RNNLM) ===" +./run.sh --python "${python}" +echo "==== ASR (backend=pytorch, lm=TransformerLM) ===" +./run.sh --python "${python}" --stage 3 --stop-stage 3 --lm-config conf/lm_transformer.yaml --decode-config "$(change_yaml.py conf/decode.yaml -a api=v2)" +# skip duplicated ASR training stage 4 +./run.sh --python "${python}" --stage 5 --lm-config conf/lm_transformer.yaml --decode-config "$(change_yaml.py conf/decode.yaml -a api=v2)" +echo "==== ASR (backend=pytorch, dtype=float64) ===" +./run.sh --python "${python}" --stage 3 --train-config "$(change_yaml.py conf/train.yaml -a train-dtype=float64)" --decode-config "$(change_yaml.py conf/decode.yaml -a api=v2 -a dtype=float64)" +echo "==== ASR (backend=pytorch, quantize-asr-model true, quantize-lm-model true) ===" +./run.sh --python "${python}" --stage 5 --decode-config "$(change_yaml.py conf/decode.yaml -a quantize-asr-model=true -a quantize-lm-model=true)" +echo "==== ASR (backend=pytorch, quantize-asr-model true, quantize-lm-model true api v2) ===" +./run.sh --python "${python}" --stage 5 --decode-config "$(change_yaml.py conf/decode.yaml -a quantize-asr-model=true -a quantize-lm-model=true -a quantize-config=['Linear'] -a api=v2)" + +echo "==== ASR (backend=chainer) ===" +./run.sh --python "${python}" --stage 3 --backend chainer + +# skip duplicated ASR training stage 2,3 +# test rnn recipe +echo "=== ASR (backend=pytorch, model=rnn-pure-ctc) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_pure_ctc.yaml \ + --decode-config conf/decode_pure_ctc.yaml +echo "=== ASR (backend=pytorch, model=rnn-no-ctc) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_no_ctc.yaml \ + --decode-config conf/decode_no_ctc.yaml + +# test transformer recipe +echo "=== ASR (backend=pytorch, model=transformer) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_transformer.yaml \ + --decode-config conf/decode.yaml +./run.sh --python "${python}" --stage 5 --train-config conf/train_transformer.yaml \ + --decode-config conf/decode.yaml --metric acc +./run.sh --python "${python}" --stage 5 --train-config conf/train_transformer.yaml \ + --decode-config conf/decode.yaml --metric loss +echo "=== ASR (backend=pytorch, model=conformer) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_conformer.yaml \ + --decode-config conf/decode.yaml +echo "=== ASR (backend=pytorch, model=transformer-pure-ctc) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_transformer_pure_ctc.yaml \ + 
--decode-config conf/decode_pure_ctc.yaml +echo "=== ASR (backend=pytorch, model=conformer-pure-ctc) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_conformer_pure_ctc.yaml \ + --decode-config conf/decode_pure_ctc.yaml +echo "=== ASR (backend=pytorch, model=transformer-no-ctc) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_transformer_no_ctc.yaml \ + --decode-config conf/decode_no_ctc.yaml +echo "=== ASR (backend=pytorch num-encs 2, model=transformer) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_transformer.yaml \ + --decode-config conf/decode.yaml + +# test transducer recipe +echo "=== ASR (backend=pytorch, model=rnnt) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_transducer.yaml \ + --decode-config conf/decode_transducer.yaml +echo "=== ASR (backend=pytorch, model=transformer-transducer) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_transformer_transducer.yaml \ + --decode-config conf/decode_transducer.yaml +echo "=== ASR (backend=pytorch, model=conformer-transducer) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_conformer_transducer.yaml \ + --decode-config conf/decode_transducer.yaml + +# test transducer with auxiliary task recipe +echo "=== ASR (backend=pytorch, model=rnnt, tasks=L1+L2+L3+L4+L5)" +./run.sh --python "${python}" --stage 4 --train-config conf/train_transducer_aux.yaml \ + --decode-config conf/decode_transducer.yaml +echo "=== ASR (backend=pytorch, model=conformer-transducer, tasks=L1+L2+L5) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_conformer_transducer_aux.yaml \ + --decode-config conf/decode_transducer.yaml + +# test finetuning +## test transfer learning +echo "=== ASR (backend=pytorch, model=rnnt, transfer_learning=enc) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_transducer_pre_init_enc.yaml \ + --decode-config conf/decode_transducer.yaml +echo "=== ASR (backend=pytorch, model=rnnt, transfer_learning=LM) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_transducer_pre_init_lm.yaml \ + --decode-config conf/decode_transducer.yaml +## to do: cover all tasks + freezing option + +echo "==== ASR (backend=pytorch num-encs 2) ===" +./run.sh --python "${python}" --stage 2 --train-config ./conf/train_mulenc2.yaml --decode-config ./conf/decode_mulenc2.yaml --mulenc true +# Remove generated files in order to reduce the disk usage +rm -rf exp tensorboard dump data +cd ${cwd} || exit 1 + +# test asr_mix recipe +cd ./egs/mini_an4/asr_mix1 || exit 1 + +echo "==== ASR Mix (backend=pytorch, model=rnn) ===" +./run.sh --python "${python}" --train-config conf/train_multispkr.yaml +echo "==== ASR Mix (backend=pytorch, model=transformer) ===" +./run.sh --python "${python}" --stage 4 --train-config conf/train_multispkr_transformer.yaml +# Remove generated files in order to reduce the disk usage +rm -rf exp tensorboard dump data +cd "${cwd}" || exit 1 + +# test st recipe +cd ./egs/mini_an4/st1 || exit 1 + +echo "==== ST (backend=pytorch) ===" +./run.sh --python "${python}" +echo "==== ST (backend=pytorch asr0.3) ===" +./run.sh --python "${python}" --stage 4 --train_config conf/train_asr0.3.yaml +echo "==== ST (backend=pytorch ctc asr0.3) ===" +./run.sh --python "${python}" --stage 4 --train_config conf/train_ctc_asr0.3.yaml +echo "==== ST (backend=pytorch mt0.3) ===" +./run.sh --python "${python}" --stage 4 --train_config conf/train_mt0.3.yaml +echo "==== ST 
(backend=pytorch asr0.2 mt0.2) ===" +./run.sh --python "${python}" --stage 4 --train_config conf/train_asr0.2_mt0.2.yaml +echo "==== ST (backend=pytorch, model=transformer) ===" +./run.sh --python "${python}" --stage 4 --train_config conf/train_transformer.yaml +./run.sh --python "${python}" --stage 5 --train_config conf/train_transformer.yaml \ + --metric acc +./run.sh --python "${python}" --stage 5 --train_config conf/train_transformer.yaml \ + --metric bleu +./run.sh --python "${python}" --stage 5 --train_config conf/train_transformer.yaml \ + --metric loss +echo "==== ST (backend=pytorch asr0.3, model=transformer) ===" +./run.sh --python "${python}" --stage 4 --train_config conf/train_transformer_asr0.3.yaml +echo "==== ST (backend=pytorch ctc asr0.3, model=transformer) ===" +./run.sh --python "${python}" --stage 4 --train_config conf/train_transformer_ctc_asr0.3.yaml +echo "==== ST (backend=pytorch mt0.3, model=transformer) ===" +./run.sh --python "${python}" --stage 4 --train_config conf/train_transformer_mt0.3.yaml +echo "==== ST (backend=pytorch asr0.2 mt0.2, model=transformer) ===" +./run.sh --python "${python}" --stage 4 --train_config conf/train_transformer_asr0.2_mt0.2.yaml +echo "==== ST (backend=pytorch asr0.2 mt0.2, model=conformer) ===" +./run.sh --python "${python}" --stage 4 --train_config conf/train_conformer_asr0.2_mt0.2.yaml +# Remove generated files in order to reduce the disk usage +rm -rf exp tensorboard dump data +cd "${cwd}" || exit 1 + +# test mt recipe +cd ./egs/mini_an4/mt1 || exit 1 + +echo "==== MT (backend=pytorch) ===" +./run.sh --python "${python}" +echo "==== MT (backend=pytorch, model=transformer) ===" +./run.sh --python "${python}" --stage 4 --train_config conf/train_transformer.yaml +./run.sh --python "${python}" --stage 5 --train_config conf/train_transformer.yaml \ + --metric acc +./run.sh --python "${python}" --stage 5 --train_config conf/train_transformer.yaml \ + --metric bleu +./run.sh --python "${python}" --stage 5 --train_config conf/train_transformer.yaml \ + --metric loss +# Remove generated files in order to reduce the disk usage +rm -rf exp tensorboard dump data +cd "${cwd}" || exit 1 + +# test tts recipe +cd ./egs/mini_an4/tts1 || exit 1 + +echo "==== TTS (backend=pytorch) ===" +./run.sh --python "${python}" +# Remove generated files in order to reduce the disk usage +rm -rf exp tensorboard dump data +cd "${cwd}" || exit 1 + +echo "=== report ===" + +coverage combine egs/*/*/.coverage +coverage report +coverage xml diff --git a/ci/test_integration_espnet2.sh b/ci/test_integration_espnet2.sh new file mode 100755 index 00000000000..78086272af7 --- /dev/null +++ b/ci/test_integration_espnet2.sh @@ -0,0 +1,152 @@ +#!/usr/bin/env bash + +set -euo pipefail + +source tools/activate_python.sh +PYTHONPATH="${PYTHONPATH:-}:$(pwd)/tools/s3prl" +export PYTHONPATH +python="coverage run --append" +cwd=$(pwd) + +#### Make sure chainer-independent #### +python3 -m pip uninstall -y chainer + +# [ESPnet2] test asr recipe +cd ./egs2/mini_an4/asr1 +echo "==== [ESPnet2] ASR ===" +./run.sh --stage 1 --stop-stage 1 +feats_types="raw fbank_pitch" +token_types="bpe char" +for t in ${feats_types}; do + ./run.sh --stage 2 --stop-stage 4 --feats-type "${t}" --python "${python}" +done +for t in ${token_types}; do + ./run.sh --stage 5 --stop-stage 5 --token-type "${t}" --python "${python}" +done +for t in ${feats_types}; do + for t2 in ${token_types}; do + echo "==== feats_type=${t}, token_types=${t2} ===" + ./run.sh --ngpu 0 --stage 6 --stop-stage 13 --skip-upload 
false --feats-type "${t}" --token-type "${t2}" \ + --asr-args "--max_epoch=1" --lm-args "--max_epoch=1" --python "${python}" + done +done +echo "==== feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ===" +./run.sh --ngpu 0 --stage 10 --stop-stage 13 --skip-upload false --feats-type "raw" --token-type "bpe" \ + --feats_normalize "utterance_mvn" --lm-args "--max_epoch=1" --python "${python}" \ + --asr-args "--model_conf extract_feats_in_collect_stats=false --max_epoch=1" + +echo "==== use_streaming, feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ===" +./run.sh --use_streaming true --ngpu 0 --stage 6 --stop-stage 13 --skip-upload false --feats-type "raw" --token-type "bpe" \ + --feats_normalize "utterance_mvn" --lm-args "--max_epoch=1" --python "${python}" \ + --asr-args "--model_conf extract_feats_in_collect_stats=false --max_epoch=1 --encoder=contextual_block_transformer --decoder=transformer + --encoder_conf block_size=40 --encoder_conf hop_size=16 --encoder_conf look_ahead=16" + +if python3 -c "import k2" &> /dev/null; then + echo "==== use_k2, num_paths > nll_batch_size, feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ===" + ./run.sh --num_paths 500 --nll_batch_size 20 --use_k2 true --ngpu 0 --stage 12 --stop-stage 13 --skip-upload false --feats-type "raw" --token-type "bpe" \ + --feats_normalize "utterance_mvn" --lm-args "--max_epoch=1" --python "${python}" \ + --asr-args "--model_conf extract_feats_in_collect_stats=false --max_epoch=1" + + echo "==== use_k2, num_paths == nll_batch_size, feats_type=raw, token_types=bpe, model_conf.extract_feats_in_collect_stats=False, normalize=utt_mvn ===" + ./run.sh --num_paths 20 --nll_batch_size 20 --use_k2 true --ngpu 0 --stage 12 --stop-stage 13 --skip-upload false --feats-type "raw" --token-type "bpe" \ + --feats_normalize "utterance_mvn" --lm-args "--max_epoch=1" --python "${python}" \ + --asr-args "--model_conf extract_feats_in_collect_stats=false --max_epoch=1" +fi + +# Remove generated files in order to reduce the disk usage +rm -rf exp dump data +cd "${cwd}" + +# [ESPnet2] test tts recipe +cd ./egs2/mini_an4/tts1 +echo "==== [ESPnet2] TTS ===" +./run.sh --ngpu 0 --stage 1 --stop-stage 8 --skip-upload false --train-args "--max_epoch 1" --python "${python}" +# Remove generated files in order to reduce the disk usage +rm -rf exp dump data + +# [ESPnet2] test gan-tts recipe +# NOTE(kan-bayashi): pytorch 1.4 - 1.6 works but 1.6 has a problem with CPU, +# so we test this recipe using only pytorch > 1.6 here. 
+# See also: https://github.com/pytorch/pytorch/issues/42446
+if python3 -c 'import torch as t; from distutils.version import LooseVersion as L; assert L(t.__version__) > L("1.6")' &> /dev/null; then
+    ./run.sh --fs 22050 --tts_task gan_tts --feats_extract linear_spectrogram --feats_normalize none --inference_model latest.pth \
+        --ngpu 0 --stop-stage 8 --skip-upload false --train-args "--num_iters_per_epoch 1 --max_epoch 1" --python "${python}"
+    rm -rf exp dump data
+fi
+cd "${cwd}"
+
+# [ESPnet2] test enh recipe
+if python -c 'import torch as t; from distutils.version import LooseVersion as L; assert L(t.__version__) >= L("1.2.0")' &> /dev/null; then
+    cd ./egs2/mini_an4/enh1
+    echo "==== [ESPnet2] ENH ==="
+    ./run.sh --stage 1 --stop-stage 1 --python "${python}"
+    feats_types="raw"
+    for t in ${feats_types}; do
+        echo "==== feats_type=${t} ==="
+        ./run.sh --ngpu 0 --stage 2 --stop-stage 10 --skip-upload false --feats-type "${t}" --spk-num 1 --enh-args "--max_epoch=1" --python "${python}"
+    done
+    # Remove generated files in order to reduce the disk usage
+    rm -rf exp dump data
+    cd "${cwd}"
+fi
+
+# [ESPnet2] test ssl1 recipe
+if python3 -c "import fairseq" &> /dev/null; then
+    cd ./egs2/mini_an4/ssl1
+    echo "==== [ESPnet2] SSL1/HUBERT ==="
+    ./run.sh --ngpu 0 --stage 1 --stop-stage 7 --feats-type "raw" --token_type "word" --skip-upload false --pt-args "--max_epoch=1" --pretrain_start_iter 0 --pretrain_stop_iter 1 --python "${python}"
+    # Remove generated files in order to reduce the disk usage
+    rm -rf exp dump data
+    cd "${cwd}"
+fi
+
+# [ESPnet2] Validate configuration files
+echo "" > dummy_token_list
+echo "==== [ESPnet2] Validation configuration files ==="
+if python3 -c 'import torch as t; from distutils.version import LooseVersion as L; assert L(t.__version__) >= L("1.8.0")' &> /dev/null; then
+    for f in egs2/*/asr1/conf/train_asr*.yaml; do
+        if [ "$f" == "egs2/fsc/asr1/conf/train_asr.yaml" ]; then
+            if ! python3 -c "import s3prl" > /dev/null; then
+                continue
+            fi
+        fi
+        ${python} -m espnet2.bin.asr_train --config "${f}" --iterator_type none --dry_run true --output_dir out --token_list dummy_token_list
+    done
+    for f in egs2/*/asr1/conf/train_lm*.yaml; do
+        ${python} -m espnet2.bin.lm_train --config "${f}" --iterator_type none --dry_run true --output_dir out --token_list dummy_token_list
+    done
+    for f in egs2/*/tts1/conf/train*.yaml; do
+        ${python} -m espnet2.bin.tts_train --config "${f}" --iterator_type none --normalize none --dry_run true --output_dir out --token_list dummy_token_list
+    done
+    for f in egs2/*/enh1/conf/train*.yaml; do
+        ${python} -m espnet2.bin.enh_train --config "${f}" --iterator_type none --dry_run true --output_dir out
+    done
+    for f in egs2/*/ssl1/conf/train*.yaml; do
+        ${python} -m espnet2.bin.hubert_train --config "${f}" --iterator_type none --normalize none --dry_run true --output_dir out --token_list dummy_token_list
+    done
+fi
+
+# These files must be identical to each other.
+for base in cmd.sh conf/slurm.conf conf/queue.conf conf/pbs.conf; do + file1= + for f in egs2/*/*/"${base}"; do + if [ -z "${file1}" ]; then + file1="${f}" + fi + diff "${file1}" "${f}" || { echo "Error: ${file1} and ${f} differ: To solve: for f in egs2/*/*/${base}; do cp egs2/TEMPLATE/asr1/${base} \${f}; done" ; exit 1; } + done +done + + +echo "==== [ESPnet2] test setup.sh ===" +for d in egs2/TEMPLATE/*; do + if [ -d "${d}" ]; then + d="${d##*/}" + egs2/TEMPLATE/"$d"/setup.sh egs2/test/"${d}" + fi +done +echo "=== report ===" + +coverage combine egs2/*/*/.coverage +coverage report +coverage xml diff --git a/ci/test_python.sh b/ci/test_python.sh index 2327e083373..b3f47146198 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -1,6 +1,7 @@ #!/usr/bin/env bash . tools/activate_python.sh +. tools/extra_path.sh set -euo pipefail @@ -17,4 +18,9 @@ fi # pycodestyle pycodestyle -r ${modules} --show-source --show-pep8 -LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-}:$(pwd)/tools/chainer_ctc/ext/warp-ctc/build" pytest -q +LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-}:$(pwd)/tools/chainer_ctc/ext/warp-ctc/build" \ + PYTHONPATH="${PYTHONPATH:-}:$(pwd)/tools/s3prl" pytest -q + +echo "=== report ===" +coverage report +coverage xml diff --git a/ci/test_utils.sh b/ci/test_utils.sh new file mode 100755 index 00000000000..11796606b0e --- /dev/null +++ b/ci/test_utils.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +echo "=== run integration tests at test_utils ===" + +PATH=$(pwd)/bats-core/bin:$PATH +if ! [ -x "$(command -v bats)" ]; then + echo "=== install bats ===" + git clone https://github.com/bats-core/bats-core.git +fi +bats test_utils/integration_test_*.bats + +echo "=== report ===" + +source tools/activate_python.sh +coverage combine egs/*/*/.coverage +coverage report +coverage xml diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 00000000000..c9e226f347d --- /dev/null +++ b/codecov.yml @@ -0,0 +1,10 @@ +# https://docs.codecov.com/docs/common-recipe-list +coverage: + status: + project: + default: + target: auto + # adjust accordingly based on how flaky your tests are + # this allows a 1% drop from the previous base commit coverage + threshold: 1% + informational: true diff --git a/doc/.gitignore b/doc/.gitignore index d4058a5aa91..79f7202744d 100644 --- a/doc/.gitignore +++ b/doc/.gitignore @@ -1,4 +1,4 @@ _gen/ _build/ build/ - +notebook/ \ No newline at end of file diff --git a/doc/README.md b/doc/README.md index 24f4cb6eeee..a316b2998c4 100644 --- a/doc/README.md +++ b/doc/README.md @@ -2,7 +2,7 @@ ## Install -We use [travis-sphinx](https://github.com/Syntaf/travis-sphinx) to generate & deploy HTML documentation. +We use [sphinx](https://www.sphinx-doc.org) to generate HTML documentation. ```sh $ cd @@ -46,8 +46,8 @@ $ cd $ ./ci/doc.sh ``` -open `doc/build/html/index.html` +open `doc/build/index.html` ## Deploy -When your PR is merged into `master` branch, our [Travis-CI](https://github.com/espnet/espnet/blob/master/.travis.yml) will automatically deploy your sphinx html into https://espnet.github.io/espnet/ by `travis-sphinx deploy`. +When your PR is merged into `master` branch, our [CI](https://github.com/espnet/espnet/blob/master/.github/workflows/doc.yml) will automatically deploy your sphinx html into https://espnet.github.io/espnet/. 
diff --git a/doc/argparse2rst.py b/doc/argparse2rst.py index 790049e0bc9..684673d90a3 100755 --- a/doc/argparse2rst.py +++ b/doc/argparse2rst.py @@ -20,11 +20,16 @@ def __init__(self, path): def get_parser(): parser = configargparse.ArgumentParser( - description='generate RST from argparse options', + description="generate RST from argparse options", config_file_parser_class=configargparse.YAMLConfigFileParser, - formatter_class=configargparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('src', type=str, nargs='+', - help='source python files that contain get_parser() func') + formatter_class=configargparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "src", + type=str, + nargs="+", + help="source python files that contain get_parser() func", + ) return parser @@ -53,7 +58,8 @@ def get_parser(): for m in modinfo: cmd = m.path.name sep = "~" * len(cmd) - print(f""" + print( + f""" .. _{cmd}: @@ -65,4 +71,5 @@ def get_parser(): :func: get_parser :prog: {cmd} -""") +""" + ) diff --git a/doc/conf.py b/doc/conf.py index 32997b08b86..c2f5acd1881 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -20,8 +20,8 @@ import os import sys -sys.path.insert(0, os.path.abspath('../espnet/nets')) -sys.path.insert(0, os.path.abspath('../utils')) +sys.path.insert(0, os.path.abspath("../espnet/nets")) +sys.path.insert(0, os.path.abspath("../utils")) # -- General configuration ------------------------------------------------ @@ -35,8 +35,8 @@ extensions = [ "nbsphinx", "sphinx.ext.autodoc", - 'sphinx.ext.napoleon', - 'sphinx.ext.viewcode', + "sphinx.ext.napoleon", + "sphinx.ext.viewcode", "sphinx.ext.mathjax", "sphinx.ext.todo", "sphinxarg.ext", @@ -44,42 +44,46 @@ ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = '.rst' -source_suffix = ['.rst', '.md'] +source_suffix = [".rst", ".md"] # enable to markdown from recommonmark.parser import CommonMarkParser source_parsers = { - '.md': CommonMarkParser, + ".md": CommonMarkParser, } # AutoStructify setting ref: https://qiita.com/pashango2/items/d1b379b699af85b529ce from recommonmark.transform import AutoStructify -github_doc_root = 'https://github.com/rtfd/recommonmark/tree/master/doc/' +github_doc_root = "https://github.com/rtfd/recommonmark/tree/master/doc/" def setup(app): - app.add_config_value('recommonmark_config', { - 'url_resolver': lambda url: github_doc_root + url, - 'auto_toc_tree_section': 'Contents', - }, True) + app.add_config_value( + "recommonmark_config", + { + "url_resolver": lambda url: github_doc_root + url, + "auto_toc_tree_section": "Contents", + }, + True, + ) app.add_transform(AutoStructify) # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = u'ESPnet' -copyright = u'2017, Shinji Watanabe' -author = u'Shinji Watanabe' +project = u"ESPnet" +copyright = u"2017, Shinji Watanabe" +author = u"Shinji Watanabe" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -87,6 +91,7 @@ def setup(app): # # The short X.Y version. import espnet + version = espnet.__version__ # The full version, including alpha/beta/rc tags. release = espnet.__version__ @@ -102,18 +107,21 @@ def setup(app): # directories to ignore when looking for source files. 
# This patterns also effect to html_static_path and html_extra_path
 exclude_patterns = [
-    '_build', 'Thumbs.db', '.DS_Store', "README.md",
-    # NOTE: becuase these genearate files are directly included
+    "_build",
+    "Thumbs.db",
+    ".DS_Store",
+    "README.md",
+    # NOTE: because these generated files are directly included
     # from the other files, we should exclude these files manually.
     "_gen/modules.rst",
     "_gen/utils_sh.rst",
     "_gen/utils_py.rst",
     "_gen/espnet_bin.rst",
-    "_gen/espnet-bin.rst"
+    "_gen/espnet-bin.rst",
 ]

 # The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+pygments_style = "sphinx"

 # If true, `todo` and `todoList` produce output, else they produce nothing.
 todo_include_todos = False
@@ -127,7 +135,7 @@ def setup(app):
 # html_theme = 'nature'
 import sphinx_rtd_theme

-html_theme = 'sphinx_rtd_theme'
+html_theme = "sphinx_rtd_theme"
 html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]

 # Theme options are theme-specific and customize the look and feel of a theme
@@ -147,16 +155,16 @@ def setup(app):
 # This is required for the alabaster theme
 # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
 html_sidebars = {
-    '**': [
-        'relations.html',  # needs 'show_related': True theme option to display
-        'searchbox.html',
+    "**": [
+        "relations.html",  # needs 'show_related': True theme option to display
+        "searchbox.html",
     ]
 }

 # -- Options for HTMLHelp output ------------------------------------------

 # Output file base name for HTML help builder.
-htmlhelp_basename = 'ESPnetdoc'
+htmlhelp_basename = "ESPnetdoc"

 # -- Options for LaTeX output ---------------------------------------------

@@ -164,15 +172,12 @@
     # The paper size ('letterpaper' or 'a4paper').
     #
     # 'papersize': 'letterpaper',
-
     # The font size ('10pt', '11pt' or '12pt').
     #
     # 'pointsize': '10pt',
-
     # Additional stuff for the LaTeX preamble.
     #
     # 'preamble': '',
-
     # Latex figure (float) alignment
     #
     # 'figure_align': 'htbp',
@@ -182,18 +187,14 @@
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-    (master_doc, 'ESPnet.tex', u'ESPnet Documentation',
-     u'Shinji Watanabe', 'manual'),
+    (master_doc, "ESPnet.tex", u"ESPnet Documentation", u"Shinji Watanabe", "manual"),
 ]

 # -- Options for manual page output ---------------------------------------

 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
-man_pages = [
-    (master_doc, 'espnet', u'ESPnet Documentation',
-     [author], 1)
-]
+man_pages = [(master_doc, "espnet", u"ESPnet Documentation", [author], 1)]

 # -- Options for Texinfo output -------------------------------------------

@@ -201,12 +202,18 @@
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
 texinfo_documents = [
-    (master_doc, 'ESPnet', u'ESPnet Documentation',
-     author, 'ESPnet', 'One line description of project.',
-     'Miscellaneous'),
+    (
+        master_doc,
+        "ESPnet",
+        u"ESPnet Documentation",
+        author,
+        "ESPnet",
+        "One line description of project.",
+        "Miscellaneous",
+    ),
 ]

-autoclass_content = 'both'
+autoclass_content = "both"

 # NOTE(kan-bayashi): Do not update outputs in notebook automatically.
-nbsphinx_execute = 'never' +nbsphinx_execute = "never" diff --git a/doc/docker.md b/doc/docker.md index a733c1c3594..b9360600626 100644 --- a/doc/docker.md +++ b/doc/docker.md @@ -10,7 +10,7 @@ $ ./run.sh --docker-gpu 0 --docker-egs chime4/asr1 --docker-folders /export/corp Optionally, you can set the CUDA version with the arguments `--docker-cuda` respectively (default version set at CUDA=9.1). The docker container can be built based on the CUDA installed in your computer if you empty this arguments. By default, all GPU-based images are built with NCCL v2 and CUDNN v7. The arguments required for the docker configuration have a prefix "--docker" (e.g., `--docker-gpu`, `--docker-egs`, `--docker-folders`). `run.sh` accept all normal ESPnet arguments, which must be followed by these docker arguments. -All docker containers are executed using the same user as your login account. If you want to run the docker in root access, add the flag `--is-root` to command line. In addition, you can pass any enviroment variable using `--docker-env` (e.g., `--docker-env "foo=path"`) +All docker containers are executed using the same user as your login account. If you want to run the docker with root access, add the flag `--is-root` to the command line. In addition, you can pass any environment variable using `--docker-env` (e.g., `--docker-env "foo=path"`) ### ESPnet 2 Recipes diff --git a/doc/espnet2_task.md b/doc/espnet2_task.md index 58a508c1393..af3f3a5d866 100644 --- a/doc/espnet2_task.md +++ b/doc/espnet2_task.md @@ -55,7 +55,7 @@ if __name__ == "__main__": ## Data input system Espnet2 also provides a command line interface to describe the training corpus. -On the contrary, unlike `fairseq` or training system such as `pytorch-lightining`, +On the contrary, unlike `fairseq` or training systems such as `pytorch-lightning`, our `Task` class doesn't have an interface for building the dataset explicitly. This is because we aim at the task related to speech/text only, so we don't need such general system so far. diff --git a/doc/espnet2_training_option.md b/doc/espnet2_training_option.md index 173af4a39a1..56c5db07830 100644 --- a/doc/espnet2_training_option.md +++ b/doc/espnet2_training_option.md @@ -326,7 +326,7 @@ and the shape information is required only when `--batch_type numel`. ### `--batch_type folded` -**In ESPnet1, this mode is refered as seq.** +**In ESPnet1, this mode is referred to as seq.** This mode creates mini-batch which has the size of `base_batch_size // max_i(1 + L_i // f_i)`. diff --git a/doc/espnet2_tutorial.md b/doc/espnet2_tutorial.md index 3f157992cff..0dd69624a4a 100644 --- a/doc/espnet2_tutorial.md +++ b/doc/espnet2_tutorial.md @@ -18,6 +18,7 @@ We are planning a super major update, called `ESPnet2`. The developing status is - You don't need to create the feature file before training, but just input wave data directly. - We support both raw wave input and extracted features. - The preprocessing for text, tokenization to characters, or sentencepieces, can be also applied during training. + - Support **self-supervised learning representations** from s3prl - Discarding the JSON format describing the training corpus. - Why do we discard the JSON format? Because a dict object generated from a large JSON file requires much memory and it also takes much time to parse such a large JSON file.
- Support distributed data-parallel training (Not enough tested) @@ -179,7 +180,7 @@ You need to do one of the following two ways to change the training configuratio ```sh # Give a configuration file -./run.sh --asr_train_config conf/train_asr.yaml +./run.sh --asr_config conf/train_asr.yaml # Give arguments to "espnet2/bin/asr_train.py" directly ./run.sh --asr_args "--foo arg --bar arg2" ``` @@ -222,8 +223,7 @@ Note that you need to setup your environment correctly to use distributed traini - [Distributed training](./espnet2_distributed.md) - [Using Job scheduling system](./parallelization.md) - -## Use specified expereiment directory for evaluation +## Use specified experiment directory for evaluation If you already have trained a model, you may wonder how to give it to run.sh when you'll evaluate it later. By default the directory name is determined according to given options, `asr_args`, `lm_args`, or etc. @@ -244,4 +244,99 @@ You can overwrite it by `--asr_exp` and `--lm_exp`. ./run.sh --download_model --skip_train true ``` -You need to fill `model_name` by yourself. See the following link about our pretrain models: https://github.com/espnet/espnet_model_zoo +You need to fill `model_name` by yourself. You can search for pretrained models on Hugging Face using the tag [espnet](https://huggingface.co/models?library=espnet). + +(Deprecated: See the following link about our pretrained models: https://github.com/espnet/espnet_model_zoo) + +## Packing and sharing your trained model + +ESPnet encourages you to share your results using platforms like [Hugging Face](https://huggingface.co/) or [Zenodo](https://zenodo.org/) (the latter will eventually be deprecated). + +For sharing your models, the last three stages of each task simplify this process. The model is packed into a zip file and uploaded to the selected platform (one or both). + +For **Hugging Face**, you need to first create a repository (`<repo_id> = <username>/<repo_name>`). +Remember to install `git-lfs` before continuing. +Then, execute `run.sh` as follows: + +```sh +# For ASR recipe +./run.sh --stage 14 --skip-upload-hf false --hf-repo <repo_id> + +# For TTS recipe +./run.sh --stage 8 --skip-upload-hf false --hf-repo <repo_id> +``` + +For **Zenodo**, you need to register your account first. Then, execute `run.sh` as follows: + +```sh +# For ASR recipe +./run.sh --stage 14 --skip-upload false + +# For TTS recipe +./run.sh --stage 8 --skip-upload false +``` + +The packed model can be uploaded to both platforms by setting the previously mentioned flags. + +## Usage of Self-Supervised Learning Representations as features + +ESPnet supports self-supervised learning representations (SSLR) to replace traditional spectrum features. In some cases, SSLRs can boost performance. + +To use SSLRs in your task, you need to make several modifications. + +### Prerequisite +1. Install [S3PRL](https://github.com/s3prl/s3prl) by `tools/installers/install_s3prl.sh`. +2. If HuBERT / Wav2Vec is needed, [fairseq](https://github.com/pytorch/fairseq) should be installed by `tools/installers/install_fairseq.sh`. + +### Usage +1. To reduce the time used in the `collect_stats` step, please specify `--feats_normalize uttmvn` in `run.sh` and pass it as arguments to `asr.sh` or other task-specific scripts. (Recommended) +2. In the configuration file, specify the `frontend` and `preencoder`. Taking `HuBERT` as an example: The `upstream` name can be any upstream model supported in S3PRL. `multilayer_feature: True` means the final representation is a weighted sum of all layers' hidden states from the SSLR model.
+ ``` + frontend: s3prl + frontend_conf: + frontend_conf: + upstream: hubert_large_ll60k # Note: If the upstream is changed, please change the input_size in the preencoder. + download_dir: ./hub + multilayer_feature: True + ``` + Here, the `preencoder` is used to reduce the input dimension to the encoder, which reduces the memory cost. The `input_size` depends on the upstream model, while the `output_size` can be set to any value. + ``` + preencoder: linear + preencoder_conf: + input_size: 1024 # Note: If the upstream is changed, please change this value accordingly. + output_size: 80 + ``` +3. Because different `upstream` models have different shift sizes (e.g., `HuBERT` and `Wav2Vec2.0` have a `20ms` frameshift), the downsampling rate (`input_layer`) in the `encoder` configuration sometimes needs to be changed. For example, using `input_layer: conv2d2` results in a total frameshift of `40ms`, which is enough for some tasks. + +## Streaming ASR +ESPnet supports streaming Transformer/Conformer ASR with blockwise synchronous beam search. + +For more details, please refer to the [paper](https://arxiv.org/pdf/2006.14941.pdf). + +### Training + +To achieve streaming ASR, please employ a blockwise Transformer/Conformer encoder in the configuration file. Taking `blockwise Transformer` as an example: The `encoder` name can be `contextual_block_transformer` or `contextual_block_conformer`. + +```yaml +encoder: contextual_block_transformer +encoder_conf: + block_size: 40 # block size for block processing + hop_size: 16 # hop size for block processing + look_ahead: 16 # look-ahead size for block processing + init_average: true # whether to use average input as initial context + ctx_pos_enc: true # whether to use positional encoding for the context vectors +``` + +### Decoding + +To enable online decoding, the argument `--use_streaming true` should be added to `run.sh`. + +```sh +./run.sh --stage 12 --use_streaming true +``` + +### FAQ +1. Issue about `'NoneType' object has no attribute 'max'` during training: Please make sure you employ the `forward_train` function during training; check more details [here](https://github.com/espnet/espnet/issues/3803). +2. I successfully trained the model, but encountered the above issue during decoding: You may forget to specify `--use_streaming true` to select streaming inference. diff --git a/doc/index.rst b/doc/index.rst index 13f20ab0a96..30cd3d35fd4 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -28,16 +28,7 @@ ESPnet is an end-to-end speech processing toolkit, mainly focuses on end-to-end ./espnet2_task.md ./espnet2_distributed.md -.. toctree:: - :maxdepth: 1 - :caption: Notebook: - - ./notebook/asr_cli.ipynb - ./notebook/asr_library.ipynb - ./notebook/tts_cli.ipynb - ./notebook/pretrained.ipynb - ./notebook/tts_realtime_demo.ipynb - ./notebook/st_demo.ipynb +.. include:: ./_gen/notebooks.rst .. include:: ./_gen/modules.rst diff --git a/doc/installation.md b/doc/installation.md index e29ebf2e259..db45a09135b 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -32,14 +32,14 @@ the following packages are installed using Anaconda, so you can skip them.)
# For CentOS $ sudo yum install libsndfile ``` -- ffmpeg (This is not required when installataion, but used in some recipes) +- ffmpeg (This is not required when installing, but used in some recipes) ```sh # For Ubuntu $ sudo apt-get install ffmpeg # For CentOS $ sudo yum install ffmpeg ``` -- flac (This is not required when installataion, but used in some recipes) +- flac (This is not required when installing, but used in some recipes) ```sh # For Ubuntu $ sudo apt-get install flac @@ -202,14 +202,14 @@ We also have [prebuilt Kaldi binaries](https://github.com/espnet/espnet/blob/mas ```sh $ cd /tools - $ make TH_VERSION=1.3.1 + $ make TH_VERSION=1.10.1 ``` Note that the CUDA version is derived from `nvcc` command. If you'd like to specify the other CUDA version, you need to give `CUDA_VERSION`. ```sh $ cd /tools - $ make TH_VERSION=1.3.1 CUDA_VERSION=10.1 + $ make TH_VERSION=1.10.1 CUDA_VERSION=11.3 ``` If you don't have `nvcc` command, packages are installed for CPU mode by default. @@ -255,7 +255,7 @@ e.g. ``` ### Check installation -You can check whether your installation is succesfully finished by +You can check whether your installation is successfully finished by ```sh cd /tools . ./activate_python.sh; python3 check_install.py diff --git a/doc/module2rst.py b/doc/module2rst.py index a4cd4db3f6c..7cb83b9e7ad 100755 --- a/doc/module2rst.py +++ b/doc/module2rst.py @@ -8,15 +8,15 @@ # parser parser = configargparse.ArgumentParser( - description='generate RST files from module recursively into /_gen', + description="generate RST files from module recursively into /_gen", config_file_parser_class=configargparse.YAMLConfigFileParser, - formatter_class=configargparse.ArgumentDefaultsHelpFormatter) -parser.add_argument('--root', nargs='+', - help='root module to generate docs recursively') -parser.add_argument('--dst', type=str, - help='destination path to generate RSTs') -parser.add_argument('--exclude', nargs='*', default=[], - help='exclude module name') + formatter_class=configargparse.ArgumentDefaultsHelpFormatter, +) +parser.add_argument( + "--root", nargs="+", help="root module to generate docs recursively" +) +parser.add_argument("--dst", type=str, help="destination path to generate RSTs") +parser.add_argument("--exclude", nargs="*", default=[], help="exclude module name") args = parser.parse_args() print(args) @@ -36,12 +36,14 @@ def gen_rst(module_path, f): doc = module.__doc__ if doc is None: doc = "" - f.write(f""" + f.write( + f""" {title} {sep} {doc} -""") +""" + ) for cpath in glob(module_path + "/**/*.py", recursive=True): print(cpath) @@ -51,7 +53,8 @@ def gen_rst(module_path, f): continue cname = to_module(cpath) csep = "-" * len(cname) - f.write(f""" + f.write( + f""" .. _{cname}: {cname} @@ -62,7 +65,8 @@ def gen_rst(module_path, f): :undoc-members: :show-inheritance: -""") +""" + ) f.flush() diff --git a/doc/notebook b/doc/notebook deleted file mode 160000 index ef3cbf880fc..00000000000 --- a/doc/notebook +++ /dev/null @@ -1 +0,0 @@ -Subproject commit ef3cbf880fcd725d11021e541a0cdfae4080446d diff --git a/doc/notebook2rst.sh b/doc/notebook2rst.sh new file mode 100755 index 00000000000..83bf7d57794 --- /dev/null +++ b/doc/notebook2rst.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +set -euo pipefail + +cd "$(dirname "$0")" + +if [ ! -d notebook ]; then + git clone https://github.com/espnet/notebook --depth 1 +fi + +echo "\ +.. 
toctree:: + :maxdepth: 1 + :caption: Notebook: +" + +find ./notebook/*.ipynb -exec echo " {}" \; diff --git a/doc/tutorial.md b/doc/tutorial.md index dd80f408b73..8428129fcdc 100644 --- a/doc/tutorial.md +++ b/doc/tutorial.md @@ -142,7 +142,7 @@ echo 2 `run.sh` has multiple stages including data prepration, traning, and etc., so you may likely want to start from the specified stage if some stages are failed by some reason for example. -You can start from specified stage as following and stop the process at the specifed stage: +You can start from the specified stage as follows and stop the process at the specified stage: ```bash # Start from 3rd stage and stop at 5th stage $ ./run.sh --stage 3 --stop-stage 5 @@ -152,96 +152,147 @@ $ ./run.sh --stage 3 --stop-stage 5 ### CTC, attention, and hybrid CTC/attention -ESPnet can completely switch the mode from CTC, attention, and hybrid CTC/attention +ESPnet can easily switch the model's training/decoding mode among CTC, attention, and hybrid CTC/attention. + +Each mode can be trained by specifying `mtlalpha` in the [training configuration](https://github.com/espnet/espnet/blob/7dc9da2f07c54b4b0e878d8ef219fcd4d16a5bec/doc/tutorial.md#changing-the-training-configuration): ```sh # hybrid CTC/attention (default) -# --mtlalpha 0.5 and --ctc_weight 0.3 in most cases -$ ./run.sh +mtlalpha: 0.3 + +# CTC +mtlalpha: 1.0 + +# attention +mtlalpha: 0.0 +``` -# CTC mode -$ ./run.sh --mtlalpha 1.0 --ctc_weight 1.0 --recog_model model.loss.best +Decoding for each mode can be done using the following decoding configurations: -# attention mode -$ ./run.sh --mtlalpha 0.0 --ctc_weight 0.0 --maxlenratio 0.8 --minlenratio 0.3 +```sh +# hybrid CTC/attention (default) +ctc-weight: 0.3 +beam-size: 10 + +# CTC +ctc-weight: 1.0 +## for best path decoding +api: v1 # default setting (can be omitted) +## for prefix search decoding w/ beam search +api: v2 +beam-size: 10 + +# attention +ctc-weight: 0.0 +beam-size: 10 +maxlenratio: 0.8 +minlenratio: 0.3 ``` -- The CTC training mode does not output the validation accuracy, and the optimum model is selected with its loss value -(i.e., `--recog_model model.loss.best`). -- The pure attention mode requires to set the maximum and minimum hypothesis length (`--maxlenratio` and `--minlenratio`), appropriately. In general, if you have more insertion errors, you can decrease the `maxlenratio` value, while if you have more deletion errors you can increase the `minlenratio` value. Note that the optimum values depend on the ratio of the input frame and output label lengths, which is changed for each language and each BPE unit. +- The CTC mode does not compute the validation accuracy, and the optimum model is selected with its loss value (i.e., `$ ./run.sh --recog_model model.loss.best`). +- The CTC decoding adopts best path decoding by default, which simply outputs the most probable label at every time step. The prefix search decoding with beam search is also supported in [beam search API v2](https://espnet.github.io/espnet/apis/espnet_bin.html?highlight=api#asr-recog-py). +- The pure attention mode requires setting the maximum and minimum hypothesis lengths (`--maxlenratio` and `--minlenratio`) appropriately. In general, if you have more insertion errors, you can decrease the `maxlenratio` value, while if you have more deletion errors you can increase the `minlenratio` value. Note that the optimum values depend on the ratio of the input frame and output label lengths, which change for each language and each BPE unit.
+- A negative `maxlenratio` can be used to set a constant maximum hypothesis length independent of the number of input frames. If `maxlenratio` is set to `-1`, the decoding will always stop after the first output, which can be used to emulate utterance classification tasks. This is suitable for some spoken language understanding and speaker identification tasks. - About the effectiveness of hybrid CTC/attention during training and recognition, see [2] and [3]. For example, hybrid CTC/attention is not sensitive to the above maximum and minimum hypothesis heuristics. ### Transducer -ESPnet also supports transducer-based models. -To switch to transducer mode, the following should be set in the training config: +***Important: If you encounter any issue related to Transducer loss, please open an issue in [our fork of warp-transducer](https://github.com/b-flo/warp-transducer).*** + +ESPnet supports models trained with Transducer loss, aka Transducer models. To train such a model, the following should be set in the training config: ``` criterion: loss model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E" ``` -Several transducer architectures are currently available: -- RNN-Transducer (default) -- Custom-Transducer (`etype: custom` and `dtype: custom`) +#### Architecture + +Several Transducer architectures are currently available in ESPnet: +- RNN-Transducer (default, e.g.: `etype: blstm` with `dtype: lstm`) +- Custom-Transducer (e.g.: `etype: custom` and `dtype: custom`) - Mixed Custom/RNN-Transducer (e.g: `etype: custom` with `dtype: lstm`) -The architecture specification is separated for the encoder and decoder parts, and defined by the user through, respectively, `etype` and `dtype` in training config. If `custom` is specified for either, a customizable architecture will be used for the corresponding part, otherwise a RNN-based architecture will be selected. +The architecture specification is separated for the encoder and decoder parts, and defined by the user through, respectively, `etype` and `dtype` in the training config. If `custom` is specified for either, a customizable architecture will be used for the corresponding part. Otherwise, an RNN-based architecture will be selected. + +Here, the *custom* architecture is a unique feature of the Transducer model in ESPnet. It was made available to add some flexibility in the architecture definition and ease the reproduction of some SOTA Transducer models mixing different layer types or parameters within the same model part (encoder or decoder).
As such, the architecture definition is different from the RNN architecture: -While defining a RNN architecture is done in an usual manner (similarly to CTC, Att and MTL) with global parameters, a customizable architecture definition for transducer is different: -1) Each blocks (or layers) for both network part should be specified individually through `enc-block-arch` or/and `dec-block-arch`: +1) Each block (or layer) of the custom architecture should be specified individually through `enc-block-arch` and/or `dec-block-arch` parameters: - # e.g: TDNN-Transformer encoder + # e.g.: Conv-Transformer encoder etype: custom enc-block-arch: - - type: tdnn - idim: 512 - odim: 320 - ctx_size: 3 - dilation: 1 + - type: conv1d + idim: 80 + odim: 32 + kernel_size: [3, 7] + stride: [1, 2] + - type: conv1d + idim: 32 + odim: 32 + kernel_size: 3 + stride: 2 + - type: conv1d + idim: 32 + odim: 384 + kernel_size: 3 stride: 1 - type: transformer - d_hidden: 320 - d_ff: 320 + d_hidden: 384 + d_ff: 1536 heads: 4 -2) Each part has different allowed block type: `tdnn`, `conformer` or `transformer` for encoder and `causal-conv1d` or `transformer` for decoder. For each block type, a set of parameters are needed: - - # TDNN - - type: tdnn - idim: input dimension - odim: output dimension - ctx_size: size of the context window - dilation: parameter to control the stride of elements within the neighborhood - stride: stride of the sliding blocks - [optional: dropout-rate] +2) Different block types are allowed for the custom encoder (`conv1d`, `conformer` or `transformer`) and the custom decoder (`causal-conv1d` or `transformer`). Each one has a set of mandatory and optional parameters: + + # 1D convolution (TDNN) block + - type: conv1d + idim: [Input dimension. (int)] + odim: [Output dimension. (int)] + kernel_size: [Size of the context window. (int or tuple)] + stride (optional): [Stride of the sliding blocks. (int or tuple, default = 1)] + dilation (optional): [Parameter to control the stride of elements within the neighborhood. (int or tuple, default = 1)] + groups (optional): [Number of blocked connections from input channels to output channels. (int, default = 1)] + bias (optional): [Whether to add a learnable bias to the output. (bool, default = True)] + use-relu (optional): [Whether to use a ReLU activation after convolution. (bool, default = True)] + use-batchnorm (optional): [Whether to use batch normalization after convolution. (bool, default = False)] + dropout-rate (optional): [Dropout-rate for TDNN block. (float, default = 0.0)] # Transformer - type: transformer - d_hidden: input/output dimension - d_ff: feed-forward hidden dimension - heads: number of heads in multi-head attention - [optional: dropout-rate, pos-dropout-rate, att-dropout-rate] + d_hidden: [Input/output dimension of Transformer block. (int)] + d_ff: [Hidden dimension of the Feed-forward module. (int)] + heads: [Number of heads in multi-head attention. (int)] + dropout-rate (optional): [Dropout-rate for Transformer block. (float, default = 0.0)] + pos-dropout-rate (optional): [Dropout-rate for positional encoding module. (float, default = 0.0)] + att-dropout-rate (optional): [Dropout-rate for attention module.
(float, default = 0.0)] # Conformer - type: conformer - d_hidden: input/output dimension - d_ff: feed-forward hidden dimension - heads: number of heads in multi-head attention - macaron_style: wheter to use macaron style - use_conv_mod: whether to use convolutional module - conv_mod_kernel: number of kernel in convolutional module (optional if `use_conv_mod=True`) - [optional: dropout-rate, pos-dropout-rate, att-dropout-rate] + d_hidden: [Input/output dimension of Conformer block. (int)] + d_ff: [Hidden dimension of the Feed-forward module. (int)] + heads: [Number of heads in multi-head attention. (int)] + macaron_style: [Whether to use macaron style. (bool)] + use_conv_mod: [Whether to use convolutional module. (bool)] + conv_mod_kernel (required if use_conv_mod = True): [Kernel size of the convolutional module. (int)] + dropout-rate (optional): [Dropout-rate for Conformer block. (float, default = 0.0)] + pos-dropout-rate (optional): [Dropout-rate for positional encoding module. (float, default = 0.0)] + att-dropout-rate (optional): [Dropout-rate for attention module. (float, default = 0.0)] # Causal Conv1d - type: causal-conv1d - idim: input dimension - odim: output dimension - kernel_size: size of convolving kernel - stride: stride of the convolution - dilation: spacing between the kernel points - -3) Each specified block(s) for each network part can be repeated by specifying the number of duplications through `enc-block-repeat` or `dec-block-repeat` parameters: + idim: [Input dimension. (int)] + odim: [Output dimension. (int)] + kernel_size: [Size of the context window. (int)] + stride (optional): [Stride of the sliding blocks. (int, default = 1)] + dilation (optional): [Parameter to control the stride of elements within the neighborhood. (int, default = 1)] + groups (optional): [Number of blocked connections from input channels to output channels. (int, default = 1)] + bias (optional): [Whether to add a learnable bias to the output. (bool, default = True)] + use-relu (optional): [Whether to use a ReLU activation after convolution. (bool, default = True)] + use-batchnorm (optional): [Whether to use batch normalization after convolution. (bool, default = False)] + dropout-rate (optional): [Dropout-rate for Causal Conv1d block. (float, default = 0.0)] + +3) The defined architecture can be repeated by specifying the total number of blocks/layers in the architecture through `enc-block-repeat` and/or `dec-block-repeat` parameters: # e.g.: 2x (Causal-Conv1d + Transformer) decoder dtype: transformer @@ -258,47 +309,88 @@ While defining a RNN architecture is done in an usual manner (similarly to CTC, att-dropout-rate: 0.4 dec-block-repeat: 2 -For more information about the customizable architecture, please refer to [vivos config examples](https://github.com/espnet/espnet/tree/master/egs/vivos/asr1/conf/tuning/transducer) which cover all cases. +#### Multi-task learning + +We also support multi-task learning with various auxiliary losses, such as CTC, cross-entropy w/ label-smoothing (LM loss), auxiliary Transducer, and symmetric KL divergence.
+The four losses can be simultaneously trained with the main Transducer loss to jointly optimize the total loss defined as: + +`\mathcal{L}_{tot} = \lambda_{1}\mathcal{L}_{1} + \lambda_{2}\mathcal{L}_{2} + \lambda_{3}\mathcal{L}_{3} + \lambda_{4}\mathcal{L}_{4} + \lambda_{5}\mathcal{L}_{5}` + +where the losses are, in order: the main Transducer loss, the CTC loss, the auxiliary Transducer loss, the symmetric KL divergence loss, and the LM loss. The lambda values define their respective contributions to the overall loss. Additionally, each loss can be independently selected or omitted depending on the task. + +Each loss can be defined in the training config alongside its specific options, as follows: + + # Transducer loss (L1) + transducer-loss-weight: [Weight of the main Transducer loss. (float)] + + # CTC loss (L2) + use-ctc-loss: True + ctc-loss-weight (optional): [Weight of the CTC loss. (float, default = 0.5)] + ctc-loss-dropout-rate (optional): [Dropout rate for encoder output representation. (float, default = 0.0)] + + # Auxiliary Transducer loss (L3) + use-aux-transducer-loss: True + aux-transducer-loss-weight (optional): [Weight of the auxiliary Transducer loss. (float, default = 0.4)] + aux-transducer-loss-enc-output-layers (required if use-aux-transducer-loss = True): [List of intermediate encoder layer IDs to compute auxiliary Transducer loss(es). (list)] + aux-transducer-loss-mlp-dim (optional): [Hidden dimension for the MLP network. (int, default = 320)] + aux-transducer-loss-mlp-dropout-rate (optional): [Dropout rate for the MLP network. (float, default = 0.0)] + + # Symmetric KL divergence loss (L4) + # Note: It can be only used in addition to the auxiliary Transducer loss. + use-symm-kl-div-loss: True + symm-kl-div-loss-weight (optional): [Weight of the symmetric KL divergence loss. (float, default = 0.2)] + + # LM loss (L5) + use-lm-loss: True + lm-loss-weight (optional): [Weight of the LM loss. (float, default = 0.2)] + lm-loss-smoothing-rate (optional): [Smoothing rate for LM loss. If > 0, label smoothing is enabled. (float, default = 0.0)] + +#### Inference + +Various decoding algorithms are also available for Transducer by setting the `beam-size` and `search-type` parameters in the decode config. -Various decoding algorithms are also available for transducer by setting `search-type` parameter in decode config: -- Default beam search (`default`) -- Time-synchronous decoding (`tsd`) -- Alignment-length decoding (`alsd`) -- N-step Constrained beam search (`nsc`) + - Greedy search constrained to one emission per timestep (`beam-size: 1`). + - Beam search algorithm without prefix search (`beam-size: >1` and `search-type: default`). + - Time Synchronous Decoding [[Saon et al., 2020]](https://ieeexplore.ieee.org/abstract/document/9053040) (`beam-size: >1` and `search-type: tsd`). + - Alignment-Length Synchronous Decoding [[Saon et al., 2020]](https://ieeexplore.ieee.org/abstract/document/9053040) (`beam-size: >1` and `search-type: alsd`). + - N-step Constrained beam search modified from [[Kim et al., 2020]](https://arxiv.org/abs/2002.03577) (`beam-size: >1` and `search-type: nsc`). + - modified Adaptive Expansion Search, based on [[Kim et al., 2021]](https://ieeexplore.ieee.org/abstract/document/9250505) and NSC (`beam-size: >1` and `search-type: maes`).
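As an illustration, a decode config selecting mAES could combine these options as follows (a hypothetical sketch with placeholder values rather than tuned recommendations; the individual parameters are detailed right below):

    # hypothetical decode config for mAES (placeholder values)
    beam-size: 10
    search-type: maes
    nstep: 2
    prefix-alpha: 2
    expansion-beta: 2
    expansion-gamma: 2.3
    score-norm-transducer: true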
-All algorithms share a common parameter to control beam size (`beam-size`) but each ones have its own parameters: +The algorithms share two parameters to control beam size (`beam-size`) and final hypotheses normalization (`score-norm-transducer`). The specific parameters for each algorithm are: # Default beam search search-type: default - score-norm-transducer: normalize final scores by length # Time-synchronous decoding search-type: tsd - max-sym-exp: number of maximum symbol expansions at each time step + max-sym-exp: [Number of maximum symbol expansions at each time step (int)] # Alignement-length decoding search-type: alsd - u-max: maximum output sequence length + u-max: [Maximum output sequence length (int)] # N-step Constrained beam search search-type: nsc - nstep: number of maximum expansion steps at each time step - (N exp. step = N symbol expansion + 1) - prefix-alpha: maximum prefix length in prefix search + nstep: [Number of maximum expansion steps at each time step (int)] + # nstep = max-sym-exp + 1 (blank) + prefix-alpha: [Maximum prefix length in prefix search (int)] -Except for the default algorithm, performance and decoding time can be controlled through described parameters. A high value will increase performance but also decoding time while a low value will decrease decoding time but will negatively impact performance. + # modified Adaptive Expansion Search + search-type: maes + nstep: [Number of maximum expansion steps at each time step (int, > 1)] + prefix-alpha: [Maximum prefix length in prefix search (int)] + expansion-beta: [Number of additional candidates in expanded hypotheses selection (int)] + expansion-gamma: [Allowed logp difference for prune-by-value method (float, > 0)] -IMPORTANT (temporary) note: ALSD, TSD and NSC have their execution time degraded because of the current batching implementation. We decided to keep it as if for internal discussions but it can be manually removed by the user to speed up inference. In a near future, the inference part for transducer will be replaced by our own torch lib. +Except for the default algorithm, the described parameters are used to control the performance and decoding speed. The optimal values for each parameter are task-dependent; a high value will typically increase decoding time to focus on performance, while a low value will improve decoding time at the expense of performance. -The algorithm references can be found in [methods documentation](https://github.com/espnet/espnet/tree/master/espnet/nets/beam_search_transducer.py). For more information about decoding usage, refer to [vivos config examples](https://github.com/espnet/espnet/tree/master/egs/vivos/asr1/conf/tuning/transducer). +#### Additional notes -Additional notes: -- Similarly to CTC training mode, transducer does not output the validation accuracy. Thus, the optimum model is selected with its loss value (i.e., --recog_model model.loss.best). -- There are several differences between MTL and transducer training/decoding options. The users should refer to `espnet/espnet/nets/pytorch_backend/e2e_asr_transducer.py` for an overview. -- RNN-decoder pre-initialization using a LM is supported. The LM state dict keys (`predictor.*`) will be matched to AM state dict keys (`dec.*`). -- Transformer-decoder pre-initialization using a transformer LM is not supported yet. -- Transformer and conformer blocks within the same architecture part (i.e: encoder) is not supported yet. -- Customizable architecture is a in-progress work and will be eventually extended to RNN.
Please report any encountered error or usage issue. +- Similarly to training with CTC, Transducer does not output the validation accuracy. Thus, the optimum model is selected with its loss value (i.e., `--recog_model model.loss.best`). +- There are several differences between MTL and Transducer training/decoding options. Users should refer to `espnet/espnet/nets/pytorch_backend/e2e_asr_transducer.py` for an overview and `espnet/espnet/nets/pytorch_backend/transducer/arguments` for all possible arguments. +- FastEmit regularization [[Yu et al., 2021]](https://arxiv.org/pdf/2010.11148) is available through the `--fastemit-lambda` training parameter (default = 0.0). +- RNN-decoder pre-initialization using an LM is supported. Note that regular decoder keys are expected. The LM state dict keys (`predictor.*`) will be renamed according to AM state dict keys (`dec.*`). +- Transformer-decoder pre-initialization using a Transformer LM is not supported yet. ### Changing the training configuration @@ -374,7 +466,7 @@ We expect the user to define the following options in its main training config ( ### Important notes - Given a pre-trained source model, the modules specified for transfer learning are expected to have the same parameters (i.e.: layers and units) as the target model modules. -- We also support initialization with a pre-trained RNN LM for the RNN-transducer decoder. +- We also support initialization with a pre-trained RNN LM for the RNN-Transducer decoder. - RNN models use different key names for encoder and decoder parts compared to Transformer, Conformer or Custom models: - RNN model use `enc.` for encoder part and `dec.` for decoder part. - Transformer/Conformer/Custom model use `encoder.` for encoder part and `decoder.` for decoder part. diff --git a/docker/.default_args b/docker/.default_args new file mode 100644 index 00000000000..48cadcda9a6 --- /dev/null +++ b/docker/.default_args @@ -0,0 +1,2 @@ +docker_cuda=11.1 +docker_os=20.04 diff --git a/docker/.gitignore b/docker/.gitignore index 9937f78b3dc..a8a0eb3cee5 100644 --- a/docker/.gitignore +++ b/docker/.gitignore @@ -1 +1,4 @@ espnet-local.tar +.custom_args +*.log +*.done \ No newline at end of file diff --git a/docker/build.sh b/docker/build.sh index 19db299e9c8..987a0f54ac7 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -3,16 +3,19 @@ # 2019, Nelson Yalta # 2019, Ludwig Kürzinger, Technische Universität München +log() { + local fname=${BASH_SOURCE[1]##*/} + echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" +} SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" -tags="cpu-u18 - gpu-cuda10.0-cudnn7-u18 - gpu-cuda10.1-cudnn7-u18" -cuda_vers="10.0 - 10.1" -docker_ver=$(docker version -f '{{.Server.Version}}') -echo "Using Docker Ver.${docker_ver}" +# Default values +ubuntu_ver=20.04 +cuda_ver=11.1 +build_ver=cpu +build_cores=24 +th_ver=1.10.1 cmd_usage() { @@ -24,11 +27,12 @@ cmd_usage() { Also able to build containers based on local build configuration. USAGE ${PROGRAM} ${PROGRAM} build_and_push - ${PROGRAM} local [cpu|9.1|9.2|10.0|10.1] + ${PROGRAM} --build-ver [cpu|gpu] local mode Select script functionality args Set up building features Modes build build docker containers build_and_push build docker containers, test and push them to Docker Hub local build a docker container from the local ESPnet repository using the base image from Docker Hub (espnet/espnet:runtime) optional: cpu or CUDA version (default: cpu) fully_local like local, but also builds the base image + + Arguments + build-ver cpu/gpu + ubuntu-ver any ubuntu version available at docker hub (e.g. 18.04/20.04/...)
+ (default: 20.04) + cuda-ver any cuda version available at nvidia (e.g. 9.0/9.1/...) + (default: 11.1) + build-cores cores employed for building the container + th-ver PyTorch version for fully local build CAVEATS For local builds, the image pulled from Docker Hub is based on Ubuntu 16, @@ -51,68 +64,95 @@ cmd_usage() { build(){ - echo "Build docker containers" + log "Build latest docker containers" # build runtime and gpu based containers - docker_image=$( docker images -q espnet/espnet:runtime ) + this_tag=espnet/espnet:runtime-latest + docker_image=$( docker images -q ${this_tag} ) if ! [[ -n ${docker_image} ]]; then - docker build --build-arg DOCKER_VER=${docker_ver} -f prebuilt/runtime/Dockerfile -t espnet/espnet:runtime . || exit 1 + log "Now building Runtime container" + docker build --build-arg DOCKER_VER=${docker_ver} \ + --build-arg FROM_TAG=${default_ubuntu_ver} \ + --build-arg NUM_BUILD_CORES=${build_cores} \ + -f prebuilt/runtime.dockerfile -t ${this_tag} . | tee -a build_runtime.log > /dev/null + + docker_image=$( docker images -q ${this_tag} ) + [ -z "${docker_image}" ] && exit 1 + fi + + this_tag=espnet/espnet:cuda-latest + docker_image=$( docker images -q ${this_tag} ) + if ! [[ -n ${docker_image} ]]; then + log "Now building CUDA container" + docker build --build-arg FROM_TAG=runtime-latest \ + -f prebuilt/gpu.dockerfile -t ${this_tag} . | tee -a build_cuda.log > /dev/null + docker_image=$( docker images -q ${this_tag} ) + [ -z "${docker_image}" ] && exit 1 fi - for ver in ${cuda_vers}; do - docker_image=$( docker images -q espnet/espnet:cuda${ver}-cudnn7 ) - if ! [[ -n ${docker_image} ]]; then - docker build -f prebuilt/devel/gpu/${ver}/cudnn7/Dockerfile -t espnet/espnet:cuda${ver}-cudnn7 . || exit 1 - fi - done # build cpu based - docker_image=$( docker images -q espnet/espnet:cpu-u18 ) + this_tag=espnet/espnet:cpu-latest + docker_image=$( docker images -q ${this_tag} ) if ! [[ -n ${docker_image} ]]; then - echo "Now building cpu-u18" - docker build --build-arg FROM_TAG=runtime -f prebuilt/devel/Dockerfile -t espnet/espnet:cpu-u18 . || exit 1 + log "Now building cpu-latest with ubuntu:${default_ubuntu_ver}" + docker build --build-arg FROM_TAG=runtime-latest \ + -f prebuilt/devel.dockerfile \ + --target devel \ + -t ${this_tag} . | tee -a build_cpu.log > /dev/null + + docker_image=$( docker images -q ${this_tag} ) + [ -z "${docker_image}" ] && exit 1 fi + # build gpu based - for ver in ${cuda_vers}; do - build_args="--build-arg FROM_TAG=cuda${ver}-cudnn7" - build_args="${build_args} --build-arg CUDA_VER=${ver}" - docker_image=$( docker images -q espnet/espnet:gpu-cuda${ver}-cudnn7-u18 ) - if ! [[ -n ${docker_image} ]]; then - echo "Now building gpu-cuda${ver}-cudnn7-u18" - docker build ${build_args} -f prebuilt/devel/Dockerfile -t espnet/espnet:gpu-cuda${ver}-cudnn7-u18 . || exit 1 - fi - done + build_args="--build-arg FROM_TAG=cuda-latest + --build-arg CUDA_VER=${default_cuda_ver}" + this_tag=espnet/espnet:gpu-latest + docker_image=$( docker images -q ${this_tag} ) + if ! [[ -n ${docker_image} ]]; then + log "Now building gpu-latest with ubuntu:${default_ubuntu_ver} and cuda:${default_cuda_ver}" + docker build ${build_args} -f prebuilt/devel.dockerfile \ + --target devel \ + -t ${this_tag} . 
| tee -a build_gpu.log > /dev/null + docker_image=$( docker images -q ${this_tag} ) + [ -z "${docker_image}" ] && exit 1 + fi } build_local(){ - echo "Building docker container: base image, and image for ${ver}" + log "Building docker container: base image, and image for ${build_ver}" sleep 1 # prepare espnet-repo, assuming that this script is in folder espnet/docker cd ${SCRIPTPATH}/.. ESPNET_ARCHIVE="./espnet-local.tar" - echo "Reconstructing the local repository from the last commit" + log "Reconstructing the local repository from the last commit" git archive -o docker/${ESPNET_ARCHIVE} HEAD || exit 1 cd ${SCRIPTPATH} test -r ${ESPNET_ARCHIVE} || exit 1; sleep 1 - if [ "${build_base_image}" = true ] ; then - echo "building ESPnet base image" - docker build --build-arg DOCKER_VER=${docker_ver} -f prebuilt/runtime/Dockerfile -t espnet/espnet:runtime . || exit 1 + if [ "${build_base_image}" = true ]; then + log "building ESPnet base image with ubuntu:${ubuntu_ver}" + docker build --build-arg DOCKER_VER=${docker_ver} \ + --build-arg FROM_TAG=${ubuntu_ver} \ + --build-arg NUM_BUILD_CORES=${build_cores} \ + -f prebuilt/runtime/Dockerfile -t espnet/espnet:runtime-local . || exit 1 sleep 1 fi - if [[ ${ver} == "cpu" ]]; then - echo "building ESPnet CPU Image" - docker build --build-arg FROM_TAG=runtime --build-arg ESPNET_ARCHIVE=${ESPNET_ARCHIVE} \ + if [[ ${build_ver} == "cpu" ]]; then + log "building ESPnet CPU Image with ubuntu:${ubuntu_ver}" + docker build --build-arg FROM_TAG=runtime-local --build-arg ESPNET_ARCHIVE=${ESPNET_ARCHIVE} \ -f prebuilt/local/Dockerfile -t espnet/espnet:cpu-local . || exit 1 - elif [[ ${ver} =~ ^(9.1|9.2|10.0|10.1)$ ]]; then - echo "building ESPnet GPU Image for ${ver}" + elif [[ ${build_ver} == "gpu" ]]; then + log "building ESPnet GPU Image with ubuntu:${ubuntu_ver} and cuda:${cuda_ver}" if [ "${build_base_image}" = true ] ; then - docker build -f prebuilt/devel/gpu/${ver}/cudnn7/Dockerfile -t espnet/espnet:cuda${ver}-cudnn7 . || exit 1 + docker build -f prebuilt/devel/gpu/${cuda_ver}/Dockerfile -t espnet/espnet:cuda${cuda_ver}-cudnn7 . || exit 1 else - if ! [[ -n $( docker images -q espnet/espnet:cuda${ver}-cudnn7) ]]; then - docker pull espnet/espnet:cuda${ver}-cudnn7 + if ! [[ -n $( docker images -q espnet/espnet:cuda-latest) ]]; then + docker pull espnet/espnet:cuda-latest fi fi build_args="--build-arg FROM_TAG=cuda${ver}-cudnn7" @@ -120,105 +160,148 @@ build_local(){ build_args="${build_args} --build-arg ESPNET_ARCHIVE=${ESPNET_ARCHIVE}" docker build ${build_args} -f prebuilt/local/Dockerfile -t espnet/espnet:gpu-cuda${ver}-cudnn7-u18-local . || exit 1 else - echo "Parameter invalid: " ${ver} + log "ERROR: Parameter invalid: " ${build_ver} fi - echo "cleanup." + log "cleanup."
test -r ${ESPNET_ARCHIVE} && rm ${ESPNET_ARCHIVE} } +run_recipe1(){ + ./run.sh --docker-egs mini_an4/asr1 \ + --docker-cmd run.sh \ + --docker-gpu ${1} \ + --verbose 1 \ + --backend ${2} \ + --ngpu ${3} \ + --stage ${4} \ + --tag train_nodev_${2}_${5} | tee -a ${PWD}/testing_${5}_${2}.log > /dev/null +} + +run_recipe2(){ + ./run.sh --docker-egs mini_an4/asr1 \ + --docker-cmd run.sh \ + --docker-gpu ${1} \ + --docker-env "NLTK_DATA=/espnet/egs2/mini_an4/asr1/nltk_data,HOME=/espnet/egs2/mini_an4/asr1" \ + --is-egs2 \ + --ngpu ${2} \ + --stage ${3} \ + --asr-tag train_nodev_${4} \ + --lm-tag train_nodev_${4} | tee -a ${PWD}/testing2_pytorch_${4}.log > /dev/null +} testing(){ - echo "Testing docker containers" + log "Testing docker containers" # Test Docker Containers with cpu setup run_stage=-1 - if [ -f ../egs/an4/asr1/dump/train_nodev/deltafalse/data.json ]; then - run_stage=3 - fi - for cuda_ver in cpu ${cuda_vers};do - for backend in pytorch chainer;do - if [ "${cuda_ver}" != "cpu" ];then - docker_cuda="--docker-cuda ${cuda_ver}" - gpu=0 - ngpu=1 - else - docker_cuda="" - gpu=-1 - ngpu=0 - fi - ( ./run.sh ${docker_cuda} \ - --docker-egs an4/asr1 \ - --docker-cmd run.sh \ - --docker-gpu ${gpu} \ - --verbose 1 \ - --backend ${backend} \ - --ngpu ${ngpu} \ - --stage ${run_stage} \ - --tag train_nodev_${backend}_cuda${cuda_ver} ) || exit 1 - done + for backend in chainer pytorch; do + if [ -f ../egs/mini_an4/asr1/dump/train_nodev/deltafalse/data.json ]; then + run_stage=3 + fi + if [ ! -f .test_cpu_${backend}.done ]; then + run_recipe1 -1 ${backend} 0 ${run_stage} "cpu" + touch .test_cpu_${backend}.done + fi done - echo "ESPnet egs Done. Press to continue with ESPnet2 egs" + for backend in chainer pytorch; do + if [ -f ../egs/mini_an4/asr1/dump/train_nodev/deltafalse/data.json ]; then + run_stage=3 + fi + if [ ! -f .test_gpu_${backend}.done ]; then + run_recipe1 0 ${backend} 1 ${run_stage} "gpu" + touch .test_gpu_${backend}.done + fi + done + + log "ESPnet egs Done. Press <enter> to continue with ESPnet2 egs" read enter # Test for espnet2 run_stage=-1 - if [ -f ../egs2/an4/asr1/dump/raw/train_nodev/text ]; then - run_stage=9 + # + if [ ! -f .test2_cpu_${backend}.done ]; then + run_recipe2 -1 0 ${run_stage} "cpu" + touch .test2_cpu_${backend}.done + fi + run_stage=6 + if [ ! 
-f .test2_gpu_${backend}.done ]; then + run_recipe2 0 1 ${run_stage} "gpu" + touch .test2_gpu_${backend}.done fi - for cuda_ver in cpu ${cuda_vers};do - if [ "${cuda_ver}" != "cpu" ];then - docker_cuda="--docker-cuda ${cuda_ver}" - gpu=0 - ngpu=1 - else - docker_cuda="" - gpu=-1 - ngpu=0 - fi - ( ./run.sh ${docker_cuda} \ - --docker-egs an4/asr1 \ - --docker-cmd run.sh \ - --docker-gpu ${gpu} \ - --is-egs2 \ - --ngpu ${ngpu} \ - --stage ${run_stage} \ - --asr_tag train_nodev_cuda${cuda_ver} \ - --lm_tag train_nodev_cuda${cuda_ver}) || exit 1 - run_stage=3 - done } push(){ - for tag in ${tags};do - echo "docker push espnet/espnet:${tag}" + for tag in runtime-latest cuda-latest cpu-latest gpu-latest;do + log "docker push espnet/espnet:${tag}" ( docker push espnet/espnet:${tag} )|| exit 1 done } +## Parameter initialization: +while test $# -gt 0 +do + case "$1" in + -h) cmd_usage + exit 0;; + --help) cmd_usage + exit 0;; + --*) ext=${1#--} + ext=${ext//-/_} + frombreak=true + for i in _ {a..z} {A..Z}; do + for var in `eval echo "\\${!${i}@}"`; do + if [ "$var" == "$ext" ]; then + eval ${ext}=$2 + frombreak=false + shift + break 2 + fi + done + done + if ${frombreak} ; then + echo "bad option $1" + exit 1 + fi + ;; + *) break + ;; + esac + shift +done -## Parameter initialization: cpu or gpu docker container (default: cpu) -if [[ -z "$2" ]]; then - ver='cpu' -else - ver=$2 + +mode=$1 +default_ubuntu_ver=20.04 +default_cuda_ver=11.1 + +check=true +[ "${default_ubuntu_ver}" != "${ubuntu_ver}" ] || [ "${default_cuda_ver}" != "${cuda_ver}" ] && check=false + +if [ ${check} = false ] && [ "${mode}" != "fully_local" ]; then + log "Error: Use of custom versions of Ubuntu (!=${default_ubuntu_ver}) and CUDA (!=${default_cuda_ver}) + is only available for mode == fully_local. + Exiting... " + exit 1; fi +docker_ver=$(docker version -f '{{.Server.Version}}') +log "Using Docker Ver.${docker_ver}" ## Application menu -if [[ $1 == "build" ]]; then +if [[ "${mode}" == "build" ]]; then build -elif [[ $1 == "local" ]]; then +elif [[ "${mode}" == "local" ]]; then build_base_image=false build_local -elif [[ $1 == "fully_local" ]]; then +elif [[ "${mode}" == "fully_local" ]]; then build_base_image=true build_local -elif [[ $1 == "push" ]]; then +elif [[ "${mode}" == "push" ]]; then push -elif [[ $1 == "test" ]]; then +elif [[ "${mode}" == "test" ]]; then testing -elif [[ $1 == "build_and_push" ]]; then +elif [[ "${mode}" == "build_and_push" ]]; then build testing push @@ -226,4 +309,4 @@ else cmd_usage fi -echo "$(basename "$0") done." +log "$(basename "$0") done." diff --git a/docker/espnet.dockerfile b/docker/espnet.dockerfile new file mode 100644 index 00000000000..b6295ca2a92 --- /dev/null +++ b/docker/espnet.dockerfile @@ -0,0 +1,23 @@ +ARG FROM_TAG +# For cuda-based images, the distribution will include cuda +FROM espnet/espnet:${FROM_TAG} +LABEL maintainer "Nelson Yalta " + +ARG THIS_USER +ARG THIS_UID +ARG EXTRA_LIBS + +# Add extra libraries (VC/TTS) + +RUN if [ ${EXTRA_LIBS} = true ]; then \ + cd /espnet/tools; \ + make extra; \ + fi + +# Add user to container +RUN if [ ! 
-z "${THIS_UID}" ]; then \ + useradd -m -r -u ${THIS_UID} -g root ${THIS_USER}; \ + fi + +USER ${THIS_USER} +WORKDIR / diff --git a/docker/prebuilt/Dockerfile b/docker/prebuilt/Dockerfile deleted file mode 100644 index 63bc7b73525..00000000000 --- a/docker/prebuilt/Dockerfile +++ /dev/null @@ -1,25 +0,0 @@ -ARG FROM_TAG -# For cuda-based images, The distribution will include cuda, cudnn, nccl -FROM espnet/espnet:${FROM_TAG} -LABEL maintainer "Nelson Yalta " - -ARG THIS_USER -ARG THIS_UID -ARG EXTRA_LIBS - -# Add extra libraries (VC/TTS) - -RUN if [ ${EXTRA_LIBS} = true ]; then \ - cd /espnet/tools; \ - . ./activate_python.sh; \ - pip install parallel_wavegan; \ - pip install git+https://github.com/cybertronai/pytorch-lamb; \ - fi - -# Add user to container -RUN if [ ! -z "${THIS_UID}" ]; then \ - useradd -m -r -u ${THIS_UID} -g root ${THIS_USER}; \ - fi - -USER ${THIS_USER} -WORKDIR / diff --git a/docker/prebuilt/devel.dockerfile b/docker/prebuilt/devel.dockerfile new file mode 100644 index 00000000000..95dc6a41059 --- /dev/null +++ b/docker/prebuilt/devel.dockerfile @@ -0,0 +1,83 @@ +ARG FROM_TAG +FROM espnet/espnet:${FROM_TAG} as devel +LABEL maintainer "Nelson Yalta " + +ARG CUDA_VER +ENV CUDA_VER ${CUDA_VER} + +ARG TH_VERSION +ENV TH_VERSION ${TH_VERSION} +WORKDIR / + +ARG ESPNET_LOCATION=https://github.com/espnet/espnet + +ENV PATH=/opt/miniconda/bin:${PATH} + +# Download ESPnet +RUN git clone ${ESPNET_LOCATION} && \ + cd espnet && \ + rm -rf docker egs egs2 espnet2 test utils && \ + rm -rf .git + +# Install espnet +WORKDIR /espnet/tools + +# Disable cupy test +# Docker build does not load libcuda.so.1 +# Replace nvidia-smi for nvcc because docker does not load nvidia-smi +RUN if [ -z "${CUDA_VER}" ]; then \ + echo "Build without CUDA" && \ + MY_OPTS='CUPY_VERSION=""'; \ + else \ + echo "Build with CUDA ${CUDA_VER}" && \ + # Docker containers cannot load cuda libs during build. + # So, their checks on cuda packages are disabled. + sed -i '200s|install.py|install.py --no-cuda --no-cupy |' Makefile && \ + export CFLAGS="-I${CUDA_HOME}/include ${CFLAGS}" && \ + MY_OPTS="CUDA_VERSION=${CUDA_VER}" && \ + . ./setup_cuda_env.sh /usr/local/cuda; \ + fi; \ + if [ ! -z "${TH_VERSION}" ]; then \ + MY_OPTS="${MY_OPTS} TH_VERSION=${TH_VERSION} "; \ + fi; \ + echo "Make with options ${MY_OPTS}" && \ + ln -s /opt/kaldi ./ && \ + rm -f activate_python.sh && touch activate_python.sh && \ + conda install -y conda "python=3.7.4" && \ + make KALDI=/opt/kaldi ${MY_OPTS} && \ + . 
./activate_python.sh && \ + ./installers/install_warp-ctc.sh && \ + ./installers/install_kenlm.sh && \ + ./installers/install_chainer.sh cpu && \ + conda clean --all && \ + rm -f *.tar.* && \ + pip cache purge + +RUN rm -rf ../espnet + +WORKDIR / + + +#### For local docker +FROM devel as espnet_local +LABEL maintainer "Nelson Yalta " + +ARG CUDA_VER +WORKDIR / + +# IF using a local ESPNet repository, a temporary file containing the ESPnet git repo is copied over +ARG ESPNET_ARCHIVE=./espnet-local.tar +COPY ${ESPNET_ARCHIVE} /espnet-local.tar + + +# Download ESPnet +RUN echo "Getting ESPnet sources from local repository, in temporary file: " ${ESPNET_ARCHIVE} +RUN mkdir /espnet +RUN tar xf espnet-local.tar -C /espnet/ +RUN rm espnet-local.tar + +RUN cd espnet && \ + rm -rf docker egs test utils + +# Install espnet +WORKDIR /espnet/tools diff --git a/docker/prebuilt/devel/Dockerfile b/docker/prebuilt/devel/Dockerfile deleted file mode 100644 index 55ddfdcdf90..00000000000 --- a/docker/prebuilt/devel/Dockerfile +++ /dev/null @@ -1,44 +0,0 @@ -ARG FROM_TAG -FROM espnet/espnet:${FROM_TAG} -LABEL maintainer "Nelson Yalta " - -ARG CUDA_VER -WORKDIR / - -ARG ESPNET_LOCATION=https://github.com/espnet/espnet - -# Download ESPnet -RUN git clone ${ESPNET_LOCATION} && \ - cd espnet && \ - rm -rf docker egs egs2 espnet2 test utils - -# Install espnet -WORKDIR /espnet/tools - -# Disable cupy test -# Docker build does not load libcuda.so.1 -# Replace nvidia-smi for nvcc because docker does not load nvidia-smi -RUN if [ -z "$( which nvcc )" ]; then \ - echo "Build without CUDA" && \ - MY_OPTS='CUPY_VERSION="" TH_VERSION=1.6.0'; \ - else \ - echo "Build with CUDA" && \ - # Docker containers cannot load cuda libs during build. - # So, their checks on cuda packages are disabled. - sed -i '200s|install.py|install.py --no-cuda --no-cupy |' Makefile && \ - export CFLAGS="-I${CUDA_HOME}/include ${CFLAGS}" && \ - MY_OPTS="CUDA_VERSION=${CUDA_VER}" && \ - . 
./setup_cuda_env.sh /usr/local/cuda; \ - fi; \ - if [ "${CUDA_VER}" = "10.1" ]; then \ - # warpctc is not supported from Pytorch 1.3.1 - MY_OPTS="${MY_OPTS} TH_VERSION=1.6.0"; \ - fi; \ - echo "Make with options ${MY_OPTS}" && \ - ln -s /kaldi ./ && \ - ./setup_anaconda.sh /miniconda espnet 3.7.4 && \ - make KALDI=/kaldi ${MY_OPTS} - -RUN rm -rf ../espnet - -WORKDIR / diff --git a/docker/prebuilt/devel/gpu/10.0/cudnn7/Dockerfile b/docker/prebuilt/devel/gpu/10.0/cudnn7/Dockerfile deleted file mode 100644 index 6556f9acab0..00000000000 --- a/docker/prebuilt/devel/gpu/10.0/cudnn7/Dockerfile +++ /dev/null @@ -1,73 +0,0 @@ -FROM espnet/espnet:runtime -LABEL maintainer "Nelson Yalta " - -## FROM CUDA 10.0 base - -RUN apt-get update && apt-get install -y --no-install-recommends gnupg2 curl ca-certificates && \ - curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub | apt-key add - && \ - echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ - echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \ - apt-get purge --autoremove -y curl && \ - rm -rf /var/lib/apt/lists/* - -ENV CUDA_VERSION 10.0.130 - -ENV CUDA_PKG_VERSION 10-0=$CUDA_VERSION-1 - -# For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-cudart-$CUDA_PKG_VERSION \ - cuda-compat-10-0 && \ - ln -s cuda-10.0 /usr/local/cuda && \ - rm -rf /var/lib/apt/lists/* - -# Required for nvidia-docker v1 -RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 - -# nvidia-container-runtime -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -ENV NVIDIA_REQUIRE_CUDA "cuda>=10.0 brand=tesla,driver>=384,driver<385 brand=tesla,driver>=410,driver<411" - -ENV CUDA_HOME /usr/local/cuda - -## FROM CUDA 10.0 runtime - -ENV NCCL_VERSION 2.4.8 - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-libraries-$CUDA_PKG_VERSION \ - cuda-nvtx-$CUDA_PKG_VERSION \ - libnccl2=$NCCL_VERSION-1+cuda10.0 && \ - apt-mark hold libnccl2 && \ - rm -rf /var/lib/apt/lists/* - - -## FROM CUDA 10.0 devel - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-libraries-dev-$CUDA_PKG_VERSION \ - cuda-nvml-dev-$CUDA_PKG_VERSION \ - cuda-minimal-build-$CUDA_PKG_VERSION \ - cuda-command-line-tools-$CUDA_PKG_VERSION \ - libnccl-dev=$NCCL_VERSION-1+cuda10.0 && \ - rm -rf /var/lib/apt/lists/* - -ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs - -## FROM CUDA 10.0-CUDNN 7 devel - -ENV CUDNN_VERSION 7.6.5.32 -LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" - -RUN apt-get update && apt-get install -y --no-install-recommends \ - libcudnn7=$CUDNN_VERSION-1+cuda10.0 \ - libcudnn7-dev=$CUDNN_VERSION-1+cuda10.0 && \ - apt-mark hold libcudnn7 && \ - rm -rf /var/lib/apt/lists/* - -WORKDIR / \ No newline at end of file diff --git a/docker/prebuilt/devel/gpu/10.1/cudnn7/Dockerfile b/docker/prebuilt/devel/gpu/10.1/cudnn7/Dockerfile deleted file mode 100644 index adcdace6628..00000000000 --- a/docker/prebuilt/devel/gpu/10.1/cudnn7/Dockerfile +++ /dev/null @@ -1,80 +0,0 @@ -FROM espnet/espnet:runtime 
-LABEL maintainer "Nelson Yalta " - -## FROM CUDA 10.1 base [https://gitlab.com/nvidia/cuda/blob/ubuntu18.04/10.1/base/Dockerfile] - -RUN apt-get update && apt-get install -y --no-install-recommends gnupg2 curl ca-certificates && \ - curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub | apt-key add - && \ - echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ - echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \ - apt-get purge --autoremove -y curl && \ - rm -rf /var/lib/apt/lists/* - -ENV CUDA_VERSION 10.1.168 - -ENV CUDA_PKG_VERSION 10-1=$CUDA_VERSION-1 - -# For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-cudart-$CUDA_PKG_VERSION \ - cuda-compat-10-1 && \ - ln -s cuda-10.1 /usr/local/cuda && \ - rm -rf /var/lib/apt/lists/* - -# Required for nvidia-docker v1 -RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 - -# nvidia-container-runtime -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -ENV NVIDIA_REQUIRE_CUDA "cuda>=10.1 brand=tesla,driver>=384,driver<385 brand=tesla,driver>=396,driver<397 brand=tesla,driver>=410,driver<411" - -ENV CUDA_HOME /usr/local/cuda - -## FROM CUDA 10.1 runtime [https://gitlab.com/nvidia/cuda/blob/ubuntu18.04/10.1/runtime/Dockerfile] - -ENV NCCL_VERSION 2.7.8 - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-libraries-$CUDA_PKG_VERSION \ - cuda-nvtx-$CUDA_PKG_VERSION \ - libnccl2=$NCCL_VERSION-1+cuda10.1 && \ - apt-mark hold libnccl2 && \ - rm -rf /var/lib/apt/lists/* - -## FROM CUDA 10.1 devel [https://gitlab.com/nvidia/cuda/blob/ubuntu18.04/10.1/devel/Dockerfile] - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-nvml-dev-$CUDA_PKG_VERSION \ - cuda-command-line-tools-$CUDA_PKG_VERSION \ - cuda-nvprof-$CUDA_PKG_VERSION \ - cuda-npp-dev-$CUDA_PKG_VERSION \ - cuda-libraries-dev-$CUDA_PKG_VERSION \ - cuda-minimal-build-$CUDA_PKG_VERSION \ - libcublas-dev=10.2.1.243-1 \ - libnccl-dev=2.7.8-1+cuda10.1 && \ - apt-mark hold libnccl-dev && \ - rm -rf /var/lib/apt/lists/* - -# apt from auto upgrading the cublas package. 
See https://gitlab.com/nvidia/container-images/cuda/-/issues/88 -RUN apt-mark hold libcublas-dev - - -ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs - -## FROM CUDA 10.1-CUDNN 7 devel - -ENV CUDNN_VERSION 7.6.0.64 -LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" - -RUN apt-get update && apt-get install -y --no-install-recommends \ - libcudnn7=$CUDNN_VERSION-1+cuda10.1 \ - libcudnn7-dev=$CUDNN_VERSION-1+cuda10.1 && \ - apt-mark hold libcudnn7 && \ - rm -rf /var/lib/apt/lists/* - -WORKDIR / \ No newline at end of file diff --git a/docker/prebuilt/devel/gpu/8.0/cudnn7/Dockerfile b/docker/prebuilt/devel/gpu/8.0/cudnn7/Dockerfile deleted file mode 100644 index 64379a2531e..00000000000 --- a/docker/prebuilt/devel/gpu/8.0/cudnn7/Dockerfile +++ /dev/null @@ -1,79 +0,0 @@ -FROM espnet/espnet:runtime -LABEL maintainer "Nelson Yalta " - -## FROM CUDA 8.0 runtime - -RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \ - rm -rf /var/lib/apt/lists/* && \ - NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ - NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ - apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \ - echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \ - echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list - -ENV CUDA_VERSION 8.0.61 - -ENV CUDA_PKG_VERSION 8-0=$CUDA_VERSION-1 -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-nvrtc-$CUDA_PKG_VERSION \ - cuda-nvgraph-$CUDA_PKG_VERSION \ - cuda-cusolver-$CUDA_PKG_VERSION \ - cuda-cublas-8-0=8.0.61.2-1 \ - cuda-cufft-$CUDA_PKG_VERSION \ - cuda-curand-$CUDA_PKG_VERSION \ - cuda-cusparse-$CUDA_PKG_VERSION \ - cuda-npp-$CUDA_PKG_VERSION \ - cuda-cudart-$CUDA_PKG_VERSION && \ - ln -s cuda-8.0 /usr/local/cuda && \ - rm -rf /var/lib/apt/lists/* - -# nvidia-docker 1.0 -LABEL com.nvidia.volumes.needed="nvidia_driver" -LABEL com.nvidia.cuda.version="${CUDA_VERSION}" - -RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 -ENV CUDA_HOME /usr/local/cuda - -# nvidia-container-runtime -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -ENV NVIDIA_REQUIRE_CUDA "cuda>=8.0" - -## FROM CUDA 8.0 devel - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-core-$CUDA_PKG_VERSION \ - cuda-misc-headers-$CUDA_PKG_VERSION \ - cuda-command-line-tools-$CUDA_PKG_VERSION \ - cuda-nvrtc-dev-$CUDA_PKG_VERSION \ - cuda-nvml-dev-$CUDA_PKG_VERSION \ - cuda-nvgraph-dev-$CUDA_PKG_VERSION \ - cuda-cusolver-dev-$CUDA_PKG_VERSION \ - cuda-cublas-dev-8-0=8.0.61.2-1 \ - cuda-cufft-dev-$CUDA_PKG_VERSION \ - cuda-curand-dev-$CUDA_PKG_VERSION \ - cuda-cusparse-dev-$CUDA_PKG_VERSION \ - cuda-npp-dev-$CUDA_PKG_VERSION \ - cuda-cudart-dev-$CUDA_PKG_VERSION \ - cuda-driver-dev-$CUDA_PKG_VERSION && \ - rm -rf /var/lib/apt/lists/* - -ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs - -## FROM CUDA 8.0 CUDNN 7 devel - -RUN echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > 
/etc/apt/sources.list.d/nvidia-ml.list - -ENV CUDNN_VERSION 7.2.1.38 -LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" - -RUN apt-get update && apt-get install -y --no-install-recommends \ - libcudnn7=$CUDNN_VERSION-1+cuda8.0 \ - libcudnn7-dev=$CUDNN_VERSION-1+cuda8.0 && \ - apt-mark hold libcudnn7 && \ - rm -rf /var/lib/apt/lists/* diff --git a/docker/prebuilt/devel/gpu/9.0/cudnn7/Dockerfile b/docker/prebuilt/devel/gpu/9.0/cudnn7/Dockerfile deleted file mode 100644 index a4147705ef1..00000000000 --- a/docker/prebuilt/devel/gpu/9.0/cudnn7/Dockerfile +++ /dev/null @@ -1,75 +0,0 @@ -FROM espnet/espnet:runtime -LABEL maintainer "Nelson Yalta " - -## FROM CUDA 9.0 base - -RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \ - rm -rf /var/lib/apt/lists/* && \ - NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ - NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ - apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \ - echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \ - echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ - echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list - -ENV CUDA_VERSION 9.0.176 - -ENV CUDA_PKG_VERSION 9-0=$CUDA_VERSION-1 -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-cudart-$CUDA_PKG_VERSION && \ - ln -s cuda-9.0 /usr/local/cuda && \ - rm -rf /var/lib/apt/lists/* - -# nvidia-docker 1.0 -LABEL com.nvidia.volumes.needed="nvidia_driver" -LABEL com.nvidia.cuda.version="${CUDA_VERSION}" - -RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 -ENV CUDA_HOME /usr/local/cuda - -# nvidia-container-runtime -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -ENV NVIDIA_REQUIRE_CUDA "cuda>=9.0" - -## FROM CUDA 9.0 runtime - -ENV NCCL_VERSION 2.4.2 - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-libraries-$CUDA_PKG_VERSION \ - cuda-cublas-9-0=9.0.176.4-1 \ - libnccl2=$NCCL_VERSION-1+cuda9.0 && \ - apt-mark hold libnccl2 && \ - rm -rf /var/lib/apt/lists/* - -## FROM CUDA 9.0 devel - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-libraries-dev-$CUDA_PKG_VERSION \ - cuda-nvml-dev-$CUDA_PKG_VERSION \ - cuda-minimal-build-$CUDA_PKG_VERSION \ - cuda-command-line-tools-$CUDA_PKG_VERSION \ - cuda-core-9-0=9.0.176.3-1 \ - cuda-cublas-dev-9-0=9.0.176.4-1 \ - libnccl-dev=$NCCL_VERSION-1+cuda9.0 && \ - rm -rf /var/lib/apt/lists/* - -ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs - -## FROM CUDA 9.0 CUDNN 7 devel - -ENV CUDNN_VERSION 7.4.2.24 -LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" - -RUN apt-get update && apt-get install -y --no-install-recommends \ - libcudnn7=$CUDNN_VERSION-1+cuda9.0 \ - libcudnn7-dev=$CUDNN_VERSION-1+cuda9.0 && \ - apt-mark hold libcudnn7 && \ - rm -rf /var/lib/apt/lists/* - diff --git a/docker/prebuilt/devel/gpu/9.1/cudnn7/Dockerfile 
b/docker/prebuilt/devel/gpu/9.1/cudnn7/Dockerfile deleted file mode 100644 index 05823cf717a..00000000000 --- a/docker/prebuilt/devel/gpu/9.1/cudnn7/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -FROM espnet/espnet:runtime -LABEL maintainer "Nelson Yalta " - -## FROM CUDA 9.1 base - -RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \ - rm -rf /var/lib/apt/lists/* && \ - NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ - NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ - apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \ - echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \ - echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ - echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list - -ENV CUDA_VERSION 9.1.85 - -ENV CUDA_PKG_VERSION 9-1=$CUDA_VERSION-1 -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-cudart-$CUDA_PKG_VERSION && \ - ln -s cuda-9.1 /usr/local/cuda && \ - rm -rf /var/lib/apt/lists/* - -# nvidia-docker 1.0 -LABEL com.nvidia.volumes.needed="nvidia_driver" -LABEL com.nvidia.cuda.version="${CUDA_VERSION}" - -RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 -ENV CUDA_HOME /usr/local/cuda - -# nvidia-container-runtime -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -ENV NVIDIA_REQUIRE_CUDA "cuda>=9.1" - -## FROM CUDA 9.1 runtime - -ENV NCCL_VERSION 2.2.12 - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-libraries-$CUDA_PKG_VERSION \ - libnccl2=$NCCL_VERSION-1+cuda9.1 && \ - apt-mark hold libnccl2 && \ - rm -rf /var/lib/apt/lists/* - -## FROM CUDA 9.1 devel - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-libraries-dev-$CUDA_PKG_VERSION \ - cuda-nvml-dev-$CUDA_PKG_VERSION \ - cuda-minimal-build-$CUDA_PKG_VERSION \ - cuda-command-line-tools-$CUDA_PKG_VERSION \ - libnccl-dev=$NCCL_VERSION-1+cuda9.1 && \ - rm -rf /var/lib/apt/lists/* - -ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs - -## FROM CUDA 9.1 CUDNN 7 - -ENV CUDNN_VERSION 7.1.2.21 -LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" - -RUN apt-get update && apt-get install -y --no-install-recommends \ - libcudnn7=$CUDNN_VERSION-1+cuda9.1 \ - libcudnn7-dev=$CUDNN_VERSION-1+cuda9.1 && \ - apt-mark hold libcudnn7 && \ - rm -rf /var/lib/apt/lists/* diff --git a/docker/prebuilt/devel/gpu/9.2/cudnn7/Dockerfile b/docker/prebuilt/devel/gpu/9.2/cudnn7/Dockerfile deleted file mode 100644 index c8c00dc5a01..00000000000 --- a/docker/prebuilt/devel/gpu/9.2/cudnn7/Dockerfile +++ /dev/null @@ -1,72 +0,0 @@ -FROM espnet/espnet:runtime -LABEL maintainer "Nelson Yalta " - -## FROM CUDA 9.2 base [https://gitlab.com/nvidia/cuda/blob/ubuntu18.04/9.2/base/Dockerfile] -# CUDA 9.2 is not officially supported on ubuntu 18.04 yet, the ubuntu 17.10 repository for CUDA were used instead. 
-RUN apt-get update && apt-get install -y --no-install-recommends gnupg2 curl ca-certificates && \ - curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1710/x86_64/7fa2af80.pub | apt-key add - && \ - echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1710/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ - echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \ - apt-get purge --autoremove -y curl && \ - rm -rf /var/lib/apt/lists/* - -ENV CUDA_VERSION 9.2.148 - -ENV CUDA_PKG_VERSION 9-2=$CUDA_VERSION-1 -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-cudart-$CUDA_PKG_VERSION && \ - ln -s cuda-9.2 /usr/local/cuda && \ - rm -rf /var/lib/apt/lists/* - -# nvidia-docker 1.0 -LABEL com.nvidia.volumes.needed="nvidia_driver" -LABEL com.nvidia.cuda.version="${CUDA_VERSION}" - -RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ - echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} -ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 - -# nvidia-container-runtime -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility -ENV NVIDIA_REQUIRE_CUDA "cuda>=9.2" - -ENV CUDA_HOME /usr/local/cuda - -## FROM CUDA 9.2 runtime - -ENV NCCL_VERSION 2.3.7 - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-libraries-$CUDA_PKG_VERSION \ - cuda-nvtx-$CUDA_PKG_VERSION \ - libnccl2=$NCCL_VERSION-1+cuda9.2 && \ - apt-mark hold libnccl2 && \ - rm -rf /var/lib/apt/lists/* - -## FROM CUDA 9.2 devel [https://gitlab.com/nvidia/cuda/blob/ubuntu18.04/9.2/devel/Dockerfile] - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cuda-libraries-dev-$CUDA_PKG_VERSION \ - cuda-nvml-dev-$CUDA_PKG_VERSION \ - cuda-minimal-build-$CUDA_PKG_VERSION \ - cuda-command-line-tools-$CUDA_PKG_VERSION \ - libnccl-dev=$NCCL_VERSION-1+cuda9.2 && \ - rm -rf /var/lib/apt/lists/* - -ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs - -## FROM CUDA 9.2-CUDNN 7 devel [https://gitlab.com/nvidia/cuda/blob/ubuntu18.04/9.2/devel/cudnn7/Dockerfile] - -ENV CUDNN_VERSION 7.5.0.56 -LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}" - -RUN apt-get update && apt-get install -y --no-install-recommends \ - libcudnn7=$CUDNN_VERSION-1+cuda9.2 \ - libcudnn7-dev=$CUDNN_VERSION-1+cuda9.2 && \ - apt-mark hold libcudnn7 && \ - rm -rf /var/lib/apt/lists/* - -WORKDIR / \ No newline at end of file diff --git a/docker/prebuilt/gpu.dockerfile b/docker/prebuilt/gpu.dockerfile new file mode 100644 index 00000000000..a94504dc52c --- /dev/null +++ b/docker/prebuilt/gpu.dockerfile @@ -0,0 +1,60 @@ +ARG FROM_TAG +ARG NUM_BUILD_CORES=8 +ARG DOCKER_VER +FROM espnet/espnet:${FROM_TAG} AS cuda_builder +LABEL maintainer "Nelson Yalta " + +## FROM CUDA 11.1 base +## [https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/11.1.1/ubuntu20.04-x86_64/base/Dockerfile] +RUN apt-get update && apt-get install -y --no-install-recommends \ + gnupg2 curl ca-certificates && \ + curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub | apt-key add - && \ + echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ + echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list 
&& \ + apt-get purge --autoremove -y curl \ + && rm -rf /var/lib/apt/lists/* + +ENV CUDA_VERSION 11.1.1 + +# For libraries in the cuda-compat-* package: https://docs.nvidia.com/cuda/eula/index.html#attachment-a +RUN apt-get update && apt-get install -y --no-install-recommends \ + cuda-cudart-11-1=11.1.74-1 \ + cuda-compat-11-1 \ + && ln -s cuda-11.1 /usr/local/cuda && \ + rm -rf /var/lib/apt/lists/* + +# Required for nvidia-docker v1 +RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf \ + && echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf + +ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} +ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 + +# nvidia-container-runtime +ENV NVIDIA_VISIBLE_DEVICES all +ENV NVIDIA_DRIVER_CAPABILITIES compute,utility +ENV NVIDIA_REQUIRE_CUDA "cuda>=11.1 brand=tesla,driver>=418,driver<419 brand=tesla,driver>=440,driver<441 driver>=450" + +ENV CUDA_HOME /usr/local/cuda + +## FROM CUDA 11.1 devel +## [https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/11.1.1/ubuntu20.04-x86_64/devel/Dockerfile] +ENV NCCL_VERSION 2.8.4 + +RUN apt-get update && apt-get install -y --no-install-recommends \ + libtinfo5 libncursesw5 \ + cuda-cudart-dev-11-1=11.1.74-1 \ + cuda-command-line-tools-11-1=11.1.1-1 \ + cuda-minimal-build-11-1=11.1.1-1 \ + cuda-libraries-dev-11-1=11.1.1-1 \ + cuda-nvml-dev-11-1=11.1.74-1 \ + libnpp-dev-11-1=11.1.2.301-1 \ + libcublas-dev-11-1=11.3.0.106-1 \ + libcusparse-dev-11-1=11.3.0.10-1 \ + && rm -rf /var/lib/apt/lists/* + +# Keep apt from auto upgrading the cublas package. See https://gitlab.com/nvidia/container-images/cuda/-/issues/88 +RUN apt-mark hold libcublas-dev-11-1 +ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs + +WORKDIR / diff --git a/docker/prebuilt/local/Dockerfile b/docker/prebuilt/local/Dockerfile deleted file mode 100644 index 15939185aff..00000000000 --- a/docker/prebuilt/local/Dockerfile +++ /dev/null @@ -1,50 +0,0 @@ -ARG FROM_TAG -FROM espnet/espnet:${FROM_TAG} -LABEL maintainer "Nelson Yalta " - -ARG CUDA_VER -WORKDIR / - -# IF using a local ESPNet repository, a temporary file containing the ESPnet git repo is copied over -ARG ESPNET_ARCHIVE=./espnet-local.tar -COPY ${ESPNET_ARCHIVE} /espnet-local.tar - - -# Download ESPnet -RUN echo "Getting ESPnet sources from local repository, in temporary file: " ${ESPNET_ARCHIVE} -RUN mkdir /espnet -RUN tar xf espnet-local.tar -C /espnet/ -RUN rm espnet-local.tar - -RUN cd espnet && \ - rm -rf docker egs test utils - -# Install espnet -WORKDIR /espnet/tools - -# Replace nvidia-smi for nvcc because docker does not load nvidia-smi -RUN if [ -z "$( which nvcc )" ]; then \ - echo "Build without CUDA" && \ - MY_OPTS='CUPY_VERSION="" TH_VERSION=1.6.0'; \ - else \ - echo "Build with CUDA" && \ - # Disable cupy test - # Docker build does not load libcuda.so.1 - # So, their checks on cuda packages are disabled. - sed -i '200s|install.py|install.py --no-cuda --no-cupy |' Makefile && \ - export CFLAGS="-I${CUDA_HOME}/include ${CFLAGS}" && \ - MY_OPTS="CUDA_VERSION=${CUDA_VER}" && \ .
./setup_cuda_env.sh /usr/local/cuda; \ - fi; \ - if [ "${CUDA_VER}" = "10.1" ]; then \ - # warpctc is not supported from Pytorch 1.3.1 - MY_OPTS="${MY_OPTS} TH_VERSION=1.6.0"; \ - fi; \ - echo "Make with options ${MY_OPTS}" && \ - ln -s /kaldi ./ && \ - ./setup_anaconda.sh /miniconda espnet 3.7.4 && \ - make KALDI=/kaldi ${MY_OPTS} - -RUN rm -rf ../espnet - -WORKDIR / diff --git a/docker/prebuilt/runtime.dockerfile b/docker/prebuilt/runtime.dockerfile new file mode 100644 index 00000000000..5f54ed90c90 --- /dev/null +++ b/docker/prebuilt/runtime.dockerfile @@ -0,0 +1,72 @@ +ARG FROM_TAG +ARG NUM_BUILD_CORES=8 +ARG DOCKER_VER + +FROM ubuntu:${FROM_TAG} AS main_builder +LABEL maintainer "Nelson Yalta " + +ENV DOCKER_BUILT_VER ${DOCKER_VER} +ENV NUM_BUILD_CORES ${NUM_BUILD_CORES} + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive \ + apt-get -y install --no-install-recommends \ + automake \ + autoconf \ + apt-utils \ + bc \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + flac \ + ffmpeg \ + gawk \ + gfortran \ + git \ + libboost-all-dev \ + libtool \ + libbz2-dev \ + liblzma-dev \ + libsndfile1-dev \ + patch \ + python2.7 \ + python3 \ + software-properties-common \ + sox \ + subversion \ + unzip \ + wget \ + zip \ + zlib1g-dev \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Latest version of git +RUN add-apt-repository ppa:git-core/ppa -y && \ + apt update && \ + apt install -y --no-install-recommends git-all && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi /opt/kaldi + +RUN wget --tries=3 -nv "https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh" -O miniconda.sh && \ + bash miniconda.sh -b -p /opt/miniconda && \ + rm miniconda.sh + +WORKDIR / + +FROM main_builder AS espnet1 +# # Using kaldi pre-built binaries +RUN cd /opt/kaldi/tools && \ + echo "" > extras/check_dependencies.sh && \ + chmod +x extras/check_dependencies.sh && \ + cd /opt/kaldi && \ + wget --tries=3 -nv https://github.com/espnet/kaldi-bin/releases/download/v0.0.1/ubuntu16-featbin.tar.gz && \ + tar -xf ./ubuntu16-featbin.tar.gz && \ + cp featbin/* src/featbin/ && \ + rm -rf featbin && \ + rm -f ubuntu16-featbin.tar.gz + +WORKDIR / diff --git a/docker/prebuilt/runtime/Dockerfile b/docker/prebuilt/runtime/Dockerfile deleted file mode 100644 index 1e1bc66a12c..00000000000 --- a/docker/prebuilt/runtime/Dockerfile +++ /dev/null @@ -1,60 +0,0 @@ -FROM ubuntu:18.04 -LABEL maintainer "Nelson Yalta " - -ARG DOCKER_VER -ENV DOCKER_BUILT_VER ${DOCKER_VER}} - -ARG NUM_BUILD_CORES=8 -ENV NUM_BUILD_CORES ${NUM_BUILD_CORES} - -RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get -y install --no-install-recommends \ - automake \ - autoconf \ - apt-utils \ - bc \ - build-essential \ - ca-certificates \ - cmake \ - curl \ - flac \ - ffmpeg \ - gawk \ - gfortran \ - git \ - libtool \ - libsndfile1-dev \ - python2.7 \ - python3 \ - sox \ - subversion \ - unzip \ - wget \ - zip \ - zlib1g-dev \ - && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -# Install Kaldi -RUN git clone https://github.com/kaldi-asr/kaldi - -RUN cd /kaldi/tools && \ - ./extras/install_mkl.sh -sp debian intel-mkl-64bit-2019.2-057 && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ - make all && \ - rm -r openfst-*/src && \ - ./extras/install_beamformit.sh && \ - ./extras/install_irstlm.sh && \ - cd /kaldi/src && \ - ./configure --shared --use-cuda=no && \ - make depend -j${NUM_BUILD_CORES} && \ - make -j${NUM_BUILD_CORES} && \ - 
find /kaldi/src -name "*.o" -exec rm -f {} \; && \ - find /kaldi/src -name "*.o" -exec rm -f {} \; - -RUN wget --tries=3 "https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh" -O miniconda.sh && \ - bash miniconda.sh -b -p /miniconda && \ - rm miniconda.sh - -WORKDIR / \ No newline at end of file diff --git a/docker/run.sh b/docker/run.sh index 4f5500a41b9..cff0d5604bc 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -3,11 +3,10 @@ docker_gpu=0 docker_egs= docker_folders= -docker_cuda=10.1 +docker_tag=latest docker_env= docker_cmd= -docker_os=u18 is_root=false is_local=false @@ -69,25 +68,19 @@ fi from_tag="cpu" if [ ! "${docker_gpu}" == "-1" ]; then - if [ -z "${docker_cuda}" ]; then - # If the docker_cuda is not set, the program will automatically - # search the installed version with default configurations (apt) - docker_cuda=$( nvcc -V | grep release ) - docker_cuda=${docker_cuda#*"release "} - docker_cuda=${docker_cuda%,*} - fi + docker_cuda=$( nvcc -V | grep release ) + docker_cuda=${docker_cuda#*"release "} + docker_cuda=${docker_cuda%,*} + # After searching for your CUDA version: if the variable docker_cuda is empty, the program will raise an error if [ -z "${docker_cuda}" ]; then - echo "CUDA was not found in your system. Use CPU image or install NVIDIA-DOCKER, CUDA and NVCC for GPU image." + echo "CUDA was not found in your system. Use the CPU image, or install NVIDIA Docker and CUDA for the GPU image." exit 1 - else - from_tag="gpu-cuda${docker_cuda}-cudnn7" fi + from_tag="gpu" fi -if [ ! -z "${docker_os}" ]; then - from_tag="${from_tag}-${docker_os}" -fi +from_tag="${from_tag}-${docker_tag}" EXTRAS=${is_extras} @@ -123,8 +116,8 @@ if [ ${is_root} = false ]; then build_args="${build_args} --build-arg THIS_UID=${UID}" build_args="${build_args} --build-arg EXTRA_LIBS=${EXTRAS}" - echo "Now running docker build ${build_args} -f prebuilt/Dockerfile -t espnet/espnet:${container_tag} ." - (docker build ${build_args} -f prebuilt/Dockerfile -t espnet/espnet:${container_tag} .) || exit 1 + echo "Now running docker build ${build_args} -f espnet.dockerfile -t espnet/espnet:${container_tag} ." + (docker build ${build_args} -f espnet.dockerfile -t espnet/espnet:${container_tag} .)
|| exit 1 fi else container_tag=${from_tag} diff --git a/egs/README.md b/egs/README.md index 2ea843d193b..61951b84d47 100755 --- a/egs/README.md +++ b/egs/README.md @@ -13,7 +13,7 @@ See: https://espnet.github.io/espnet/tutorial.html | aishell2 | AISHELL-2 Open Source Mandarin Speech Corpus | ASR | ZH | http://www.aishelltech.com/aishell_2 | | ami | The AMI Meeting Corpus | ASR | EN | http://groups.inf.ed.ac.uk/ami/corpus/ | | | an4 | CMU AN4 database | ASR/TTS | EN | http://www.speech.cs.cmu.edu/databases/an4/ | | -| arctic | CMU ARCTIC databases | TTS, VC | EN, EN -> EN | http://www.festvox.org/cmu_arctic/ | | +| arctic | CMU ARCTIC databases | TTS, VC | EN, EN -> EN | http://www.festvox.org/cmu_arctic/ | | | aurora4 | Aurora-4 database | ASR | EN | http://aurora.hsnr.de/aurora-4.html | | | babel | IARPA Babel corups | ASR | ~20 Languages | https://www.iarpa.gov/index.php/research-programs/babel | | | blizzard_2017 | Blizzard Challenge 2017 | TTS | EN | https://www.synsig.org/index.php/Blizzard_Challenge_2017 | | @@ -22,6 +22,7 @@ See: https://espnet.github.io/espnet/tutorial.html | chime6 | The 6th CHiME Speech Separation and Recognition Challenge | ASR | EN | https://chimechallenge.github.io/chime6/ | | | cmu_wilderness | CMU Wilderness Multilingual Speech Dataset | Multilingual ASR | ~100 Languages | https://github.com/festvox/datasets-CMU_Wilderness | | | commonvoice | The Mozilla Common Voice | ASR | 13 Languages | https://voice.mozilla.org/datasets | | +| covost2 | CoVoST: A Large-Scale Multilingual Speech-To-Text Translation Corpus | ASR/Machine Translation/Speech Translation | 15+21 Language pairs | https://github.com/facebookresearch/covost | | | csj | Corpus of Spontaneous Japanese | ASR | JP | https://pj.ninjal.ac.jp/corpus_center/csj/en/ | | | csmsc | Chinese Standard Mandarin Speech Copus | TTS | ZH | https://www.data-baker.com/open_source.html | | | dipco | Dinner Party Corpus | ASR | EN | https://arxiv.org/abs/1909.13447 | | @@ -33,6 +34,9 @@ See: https://espnet.github.io/espnet/tutorial.html | hub4_spanish | 1997 Spanish Broadcast News Speech (HUB4-NE) | ASR | ES | https://catalog.ldc.upenn.edu/LDC98S74, https://catalog.ldc.upenn.edu/LDC98T29 | | | iwslt16 | International Workshop on Spoken Language Translation 2016 | Machine Translation | EN->DE | https://wit3.fbk.eu/mt.php?release=2016-01 | | | iwslt18 | International Workshop on Spoken Language Translation 2018 | ASR/Machine Translation/Speech Translation | EN->DE | https://sites.google.com/site/iwsltevaluation2018/Lectures-task | | +| iwslt19 | International Workshop on Spoken Language Translation 2019 | ASR/Speech Translation | EN->DE | https://sites.google.com/view/iwslt-evaluation-2019/speech-translation | +| iwslt21 | International Workshop on Spoken Language Translation 2021 | ASR/Machine Translation/Speech Translation | EN->DE | https://iwslt.org/2021/offline | +| iwslt21_low_resource | International Workshop on Spoken Language Translation 2021 | ASR/Speech Translation | SWA->EN & SWC->FR | https://iwslt.org/2021/low-resource | | jesc | Japanese-English Subtitle Corpus | Machine Translation | EN->JP | https://nlp.stanford.edu/projects/jesc/ | | | jnas | ASJ Japanese Newspaper Article Sentences Read Speech Corpus (JNAS) | ASR/TTS | JP | http://research.nii.ac.jp/src/JNAS.html | | | jsalt18e2e | Multilingual End-to-end ASR for Incomplete Data Benchmark | Multilingual ASR | ~20 Languages | https://www.clsp.jhu.edu/workshops/18-workshop/multilingual-end-end-asr-incomplete-data/ | babel+ | @@ -45,7 +49,10 @@ See: 
https://espnet.github.io/espnet/tutorial.html | librispeech | LibriSpeech ASR corpus | ASR | EN | http://www.openslr.org/12 | | | libritts | LibriTTS: A Corpus Derived from LibriSpeech for Text-to-Speech | TTS | EN | http://www.openslr.org/60/ | | | ljspeech | The LJ Speech Dataset | TTS | EN | https://keithito.com/LJ-Speech-Dataset/ | | +| lrs | The Lip Reading Sentences Dataset | ASR/AVSR | EN | https://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html | | | m_ailabs | The M-AILABS Speech Dataset | TTS | ~5 languages | https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/ | +| mucs_2021 | MUCS 2021: MUltilingual and Code-Switching ASR Challenges for Low Resource Indian Languages | ASR/Code Switching | HI, MR, OR, TA, TE, GU, HI-EN, BN-EN | https://navana-tech.github.io/MUCS2021/data.html | | +| mtedx | Multilingual TEDx | ASR/Machine Translation/Speech Translation | 13 Language pairs | http://www.openslr.org/100/ | | must_c | Must-C Multilingual Speech Translation Corpus | ASR/Machine Translation/Speech Translation | EN->{DE, ES, FR, IT, NL, PT, RO, RU} | https://ict.fbk.eu/must-c/ | | | | must_c_v2 | Must-C Multilingual Speech Translation Corpus | ASR/Machine Translation/Speech Translation | EN->DE | https://ict.fbk.eu/must-c/ https://iwslt.org/2021/offline | More talks that result in 20k more audio/text segments. Improved cleaning strategies able to better discard low-quality triplets. TED talks of MuST-C v2 were downloaded from the YouTube TED channel. | | puebla_nahuatl | The Puebla-Nahuatl Corpus | ASR | Nahuatl | http://www.openslr.org/89 | | diff --git a/egs/aidatatang_200zh/asr1/cmd.sh b/egs/aidatatang_200zh/asr1/cmd.sh index 4d70c9c7a79..7b70ef5e06e 100644 --- a/egs/aidatatang_200zh/asr1/cmd.sh +++ b/egs/aidatatang_200zh/asr1/cmd.sh @@ -22,7 +22,7 @@ # If jobs failed, your configuration might be wrong for your environment. # # -# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl: +# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl: # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html # =========================================================~ @@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then # The default setting is written in conf/slurm.conf. # You must change "-p cpu" and "-p gpu" for the "partion" for your environment. # To know the "partion" names, type "sinfo". - # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*" + # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*" # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}". 
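(Editorial aside: the many identical cmd.sh hunks in this patch all document the same Kaldi-style launcher interface. A minimal sketch of how the exported wrapper is consumed, reusing the array-job example from the usage header that this patch adds in egs/cmu_indic/tts1/cmd.sh:)

```bash
# Source the recipe's cmd.sh so ${train_cmd} points at the chosen backend
# (run.pl locally, queue.pl or slurm.pl on a cluster).
. ./cmd.sh
# Kaldi-style array job: 10 shards, 4 GB of memory each, one log per shard.
${train_cmd} --mem 4G JOB=1:10 echo.JOB.log echo JOB
# Under slurm.pl, adding "--gpu 1" is interpreted as "--gres gpu:1".
```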
export train_cmd="slurm.pl" diff --git a/egs/aishell/asr1/RESULTS.md b/egs/aishell/asr1/RESULTS.md index 158645ba99d..6221ac11328 100644 --- a/egs/aishell/asr1/RESULTS.md +++ b/egs/aishell/asr1/RESULTS.md @@ -1,3 +1,82 @@ +# Conformer-Transducer with auxiliary task (CTC weight = 0.5) + +## Environments +- Same as RNN-Transducer (see below) + +## Config files +- preprocess config: `conf/specaug.yaml` +- train config: `conf/tuning/transducer/train_conformer-rnn_transducer_aux_ngpu4.yaml` +- lm config: `-` (LM was not used) +- decode config: `conf/tuning/transducer/decode_default.yaml` +- ngpu: `4` + +## Results (CER) +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_dev_decode_default|14326|205341|95.8|4.0|0.2|0.1|4.3|33.6| +|decode_test_decode_default|7176|104765|95.3|4.4|0.2|0.1|4.8|36.3| + + +# Conformer-Transducer + +## Environments +- Same as RNN-Transducer (see below) + +## Config files +- preprocess config: `conf/specaug.yaml` +- train config: `conf/tuning/transducer/train_conformer-rnn_transducer.yaml` +- lm config: `-` (LM was not used) +- decode config: `conf/tuning/transducer/decode_default.yaml` + +## Results (CER) +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_dev_decode_default|14326|205341|95.6|4.2|0.2|0.1|4.5|34.0| +|decode_test_decode_default|7176|104765|95.0|4.7|0.3|0.1|5.0|37.1| + + +# RNN-Transducer with auxiliary task (CTC weight = 0.1) + +## Environments +- Same as RNN-Transducer (see below) + +## Config files +- preprocess config: `conf/specaug.yaml` +- train config: `conf/tuning/transducer/train_transducer_aux.yaml` +- lm config: `-` (LM was not used) +- decode config: `conf/tuning/transducer/decode_default.yaml` + +## Results (CER) +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_dev_decode_default|14326|205341|93.9|5.8|0.3|0.1|6.3|41.9| +|decode_test_decode_default|7176|104765|93.2|6.5|0.4|0.1|6.9|44.5| + + +# RNN-Transducer + +## Environments +- date: `Thu May 20 05:29:03 UTC 2021` +- python version: `3.7.4 (default, Aug 13 2019, 20:35:49) [GCC 7.3.0]` +- espnet version: `espnet 0.9.8` +- chainer version: `chainer 6.0.0` +- pytorch version: `pytorch 1.6.0` +- Git hash: `95b3008cdcc2247e781a048bc999243dc7f45fe7` + - Commit date: `Sat Mar 6 00:48:29 2021 +0000` + +## Config files +- preprocess config: `conf/specaug.yaml` +- train config: `conf/tuning/transducer/train_transducer.yaml` +- lm config: `-` (LM was not used) +- decode config: `conf/tuning/transducer/decode_default.yaml` + +## Results (CER) +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_dev_decode_default|14326|205341|93.8|5.9|0.3|0.1|6.3|42.0| +|decode_test_decode_default|7176|104765|92.9|6.7|0.3|0.1|7.2|45.9| + + # Conformer (kernel size = 15) + SpecAugment + LM weight = 0.0 result - training config file: `conf/tuning/train_pytorch_conformer_kernel15.yaml` diff --git a/egs/aishell/asr1/cmd.sh b/egs/aishell/asr1/cmd.sh index 4d70c9c7a79..7b70ef5e06e 100644 --- a/egs/aishell/asr1/cmd.sh +++ b/egs/aishell/asr1/cmd.sh @@ -22,7 +22,7 @@ # If jobs failed, your configuration might be wrong for your environment. 
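(Editorial aside: the transducer results added to RESULTS.md above are produced with the conf/tuning/transducer/*.yaml files introduced later in this patch. A hypothetical invocation, assuming the standard egs/*/asr1 run.sh convention where --train-config and --decode-config map onto the recipe's train_config/decode_config variables via parse_options.sh:)

```bash
cd egs/aishell/asr1
# RESULTS.md states "LM was not used" for these models, so no LM config is passed.
./run.sh --ngpu 1 \
    --preprocess-config conf/specaug.yaml \
    --train-config conf/tuning/transducer/train_transducer.yaml \
    --decode-config conf/tuning/transducer/decode_default.yaml
```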
# # -# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl: +# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl: # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html # =========================================================~ @@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then # The default setting is written in conf/slurm.conf. # You must change "-p cpu" and "-p gpu" for the "partion" for your environment. # To know the "partion" names, type "sinfo". - # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*" + # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*" # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}". export train_cmd="slurm.pl" diff --git a/egs/aishell/asr1/conf/tuning/train_pytorch_conformer_kernel15.yaml b/egs/aishell/asr1/conf/tuning/train_pytorch_conformer_kernel15.yaml index 7d5effcca04..5a51f93ffda 100644 --- a/egs/aishell/asr1/conf/tuning/train_pytorch_conformer_kernel15.yaml +++ b/egs/aishell/asr1/conf/tuning/train_pytorch_conformer_kernel15.yaml @@ -43,7 +43,7 @@ transformer-init: pytorch transformer-encoder-pos-enc-layer-type: rel_pos transformer-encoder-selfattn-layer-type: rel_selfattn transformer-encoder-activation-type: swish -rel_pos_type: latest +rel-pos-type: latest macaron-style: true use-cnn-module: true cnn-module-kernel: 15 diff --git a/egs/aishell/asr1/conf/tuning/transducer/decode_default.yaml b/egs/aishell/asr1/conf/tuning/transducer/decode_default.yaml new file mode 100644 index 00000000000..b62b87b7f73 --- /dev/null +++ b/egs/aishell/asr1/conf/tuning/transducer/decode_default.yaml @@ -0,0 +1,5 @@ +# decoding parameters +batch: 0 +beam-size: 10 +search-type: default +score-norm: True diff --git a/egs/aishell/asr1/conf/tuning/transducer/train_conformer-rnn_transducer.yaml b/egs/aishell/asr1/conf/tuning/transducer/train_conformer-rnn_transducer.yaml new file mode 100644 index 00000000000..cfb84ec9732 --- /dev/null +++ b/egs/aishell/asr1/conf/tuning/transducer/train_conformer-rnn_transducer.yaml @@ -0,0 +1,52 @@ +# minibatch related +batch-size: 64 +maxlen-in: 512 +maxlen-out: 150 + +# optimization related +criterion: loss +early-stop-criterion: "validation/main/loss" +sortagrad: 0 +opt: noam +transformer-lr: 1.0 +transformer-warmup-steps: 25000 +epochs: 100 +patience: 0 +accum-grad: 2 +grad-clip: 5.0 + +# network architecture +## general +custom-enc-positional-encoding-type: rel_pos +custom-enc-self-attn-type: rel_self_attn +custom-enc-pw-activation-type: swish +## encoder related +etype: custom +custom-enc-input-layer: vgg2l +enc-block-arch: + - type: conformer + d_hidden: 512 + d_ff: 2048 + heads: 4 + macaron_style: True + use_conv_mod: True + conv_mod_kernel: 15 + dropout-rate: 0.3 + att-dropout-rate: 0.3 +enc-block-repeat: 12 +## decoder related +dtype: lstm +dlayers: 1 +dec-embed-dim: 1024 +dunits: 512 +dropout-rate-embed-decoder: 0.2 +dropout-rate-decoder: 0.1 +## joint network related +joint-dim: 512 + +# transducer related +model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E" + +# reporter related +report-wer: True +report-cer: True diff --git a/egs/aishell/asr1/conf/tuning/transducer/train_conformer-rnn_transducer_aux_ngpu4.yaml b/egs/aishell/asr1/conf/tuning/transducer/train_conformer-rnn_transducer_aux_ngpu4.yaml new file mode 100644 index 00000000000..28c37402b4b --- /dev/null +++ b/egs/aishell/asr1/conf/tuning/transducer/train_conformer-rnn_transducer_aux_ngpu4.yaml @@ -0,0 +1,57 @@ +# 
minibatch related +batch-size: 32 +maxlen-in: 512 +maxlen-out: 150 + +# optimization related +criterion: loss +early-stop-criterion: "validation/main/loss" +sortagrad: 0 +opt: noam +transformer-lr: 1.0 +transformer-warmup-steps: 25000 +epochs: 100 +patience: 0 +#accum-grad: 2 +grad-clip: 5.0 + +# network architecture +## general +custom-enc-positional-encoding-type: rel_pos +custom-enc-self-attn-type: rel_self_attn +custom-enc-pw-activation-type: swish +## encoder related +etype: custom +custom-enc-input-layer: vgg2l +enc-block-arch: + - type: conformer + d_hidden: 512 + d_ff: 2048 + heads: 4 + macaron_style: True + use_conv_mod: True + conv_mod_kernel: 15 + dropout-rate: 0.3 + att-dropout-rate: 0.3 +enc-block-repeat: 12 +## decoder related +dtype: lstm +dlayers: 1 +dec-embed-dim: 1024 +dunits: 512 +dropout-rate-embed-decoder: 0.2 +dropout-rate-decoder: 0.1 +## joint network related +joint-dim: 512 + +# transducer related +model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E" + +# reporter related +report-wer: True +report-cer: True + +# auxiliary task +aux-ctc: True +aux-ctc-weight: 0.5 +aux-ctc-dropout-rate: 0.1 diff --git a/egs/aishell/asr1/conf/tuning/transducer/train_transducer.yaml b/egs/aishell/asr1/conf/tuning/transducer/train_transducer.yaml new file mode 100644 index 00000000000..c8be66354fc --- /dev/null +++ b/egs/aishell/asr1/conf/tuning/transducer/train_transducer.yaml @@ -0,0 +1,37 @@ +# minibatch related +batch-size: 64 +maxlen-in: 512 +maxlen-out: 150 + +# optimization related +criterion: loss +early-stop-criterion: "validation/main/loss" +sortagrad: 0 +opt: adadelta +epochs: 30 +patience: 3 +accum-grad: 2 + +# network architecture +## encoder related +etype: vggblstm +elayers: 6 +eunits: 512 +eprojs: 512 +dropout-rate: 0.4 +## decoder related +dtype: lstm +dlayers: 1 +dec-embed-dim: 1024 +dunits: 512 +dropout-rate-embed-decoder: 0.2 +dropout-rate-decoder: 0.1 +## joint network related +joint-dim: 512 + +# transducer related +model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E" + +# reporter related +report-wer: True +report-cer: True diff --git a/egs/aishell/asr1/conf/tuning/transducer/train_transducer_aux.yaml b/egs/aishell/asr1/conf/tuning/transducer/train_transducer_aux.yaml new file mode 100644 index 00000000000..9c3fc715bc7 --- /dev/null +++ b/egs/aishell/asr1/conf/tuning/transducer/train_transducer_aux.yaml @@ -0,0 +1,42 @@ +# minibatch related +batch-size: 64 +maxlen-in: 512 +maxlen-out: 150 + +# optimization related +criterion: loss +early-stop-criterion: "validation/main/loss" +sortagrad: 0 +opt: adadelta +epochs: 30 +patience: 3 +accum-grad: 2 + +# network architecture +## encoder related +etype: vggblstm +elayers: 6 +eunits: 512 +eprojs: 512 +dropout-rate: 0.4 +## decoder related +dtype: lstm +dlayers: 1 +dec-embed-dim: 1024 +dunits: 512 +dropout-rate-embed-decoder: 0.2 +dropout-rate-decoder: 0.1 +## joint network related +joint-dim: 512 + +# transducer related +model-module: "espnet.nets.pytorch_backend.e2e_asr_transducer:E2E" + +# reporter related +report-wer: True +report-cer: True + +# auxiliary task +aux-ctc: True +aux-ctc-weight: 0.1 +aux-ctc-dropout-rate: 0.1 diff --git a/egs/aishell/asr1/run.sh b/egs/aishell/asr1/run.sh index 3f92e8bac00..a19805ee0cd 100755 --- a/egs/aishell/asr1/run.sh +++ b/egs/aishell/asr1/run.sh @@ -241,6 +241,15 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then --out ${expdir}/results/${recog_model} \ --num ${n_average} fi + + if [[ $(get_yaml.py ${train_config} model-module) = *transducer* ]]; 
then + echo "[info]: transducer model does not support '--api v2'" \ + "(hence ngram is ignored)" + recog_v2_opts="" + else + recog_v2_opts="--ngram-model ${ngramexpdir}/${n_gram}gram.bin --api v2" + fi + pids=() # initialize pids for rtask in ${recog_set}; do ( @@ -263,8 +272,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then --result-label ${expdir}/${decode_dir}/data.JOB.json \ --model ${expdir}/results/${recog_model} \ --rnnlm ${lmexpdir}/rnnlm.model.best \ - --ngram-model ${ngramexpdir}/${n_gram}gram.bin \ - --api v2 + ${recog_v2_opts} score_sclite.sh ${expdir}/${decode_dir} ${dict} diff --git a/egs/aishell2/asr1/cmd.sh b/egs/aishell2/asr1/cmd.sh index 4d70c9c7a79..7b70ef5e06e 100755 --- a/egs/aishell2/asr1/cmd.sh +++ b/egs/aishell2/asr1/cmd.sh @@ -22,7 +22,7 @@ # If jobs failed, your configuration might be wrong for your environment. # # -# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl: +# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl: # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html # =========================================================~ @@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then # The default setting is written in conf/slurm.conf. # You must change "-p cpu" and "-p gpu" for the "partion" for your environment. # To know the "partion" names, type "sinfo". - # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*" + # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*" # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}". export train_cmd="slurm.pl" diff --git a/egs/ami/asr1/cmd.sh b/egs/ami/asr1/cmd.sh index 4d70c9c7a79..7b70ef5e06e 100644 --- a/egs/ami/asr1/cmd.sh +++ b/egs/ami/asr1/cmd.sh @@ -22,7 +22,7 @@ # If jobs failed, your configuration might be wrong for your environment. # # -# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl: +# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl: # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html # =========================================================~ @@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then # The default setting is written in conf/slurm.conf. # You must change "-p cpu" and "-p gpu" for the "partion" for your environment. # To know the "partion" names, type "sinfo". - # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*" + # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*" # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}". export train_cmd="slurm.pl" diff --git a/egs/ami/asr1/local/ami_download.sh b/egs/ami/asr1/local/ami_download.sh index bae72d1716a..0c19b24b9ad 100755 --- a/egs/ami/asr1/local/ami_download.sh +++ b/egs/ami/asr1/local/ami_download.sh @@ -103,7 +103,7 @@ else fi fi -echo "Downloads of AMI corpus completed succesfully. License can be found under $adir/LICENCE.TXT" +echo "Downloads of AMI corpus completed successfully. 
License can be found under $adir/LICENCE.TXT" exit 0; diff --git a/egs/ami/asr1/local/ami_ihm_scoring_data_prep.sh b/egs/ami/asr1/local/ami_ihm_scoring_data_prep.sh index f8420041362..a0b6470fb87 100755 --- a/egs/ami/asr1/local/ami_ihm_scoring_data_prep.sh +++ b/egs/ami/asr1/local/ami_ihm_scoring_data_prep.sh @@ -88,7 +88,7 @@ awk '{print $1}' $dir/segments | \ sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1; #check and correct the case when segment timings for given speaker overlap themself -#(important for simulatenous asclite scoring to proceed). +#(important for simultaneous asclite scoring to proceed). #There is actually only one such case for devset and automatic segmentetions join $dir/utt2spk $dir/segments | \ perl -ne '{BEGIN{$pu=""; $pt=0.0;} split; diff --git a/egs/ami/asr1/local/ami_mdm_scoring_data_prep.sh b/egs/ami/asr1/local/ami_mdm_scoring_data_prep.sh index 7d4d963f688..8dc96e52318 100755 --- a/egs/ami/asr1/local/ami_mdm_scoring_data_prep.sh +++ b/egs/ami/asr1/local/ami_mdm_scoring_data_prep.sh @@ -94,7 +94,7 @@ awk '{print $1}' $tmpdir/segments | \ print "$1$2$3 $1$2\n";' > $tmpdir/utt2spk_stm || exit 1; #check and correct case when segment timings for a given speaker overlap themself -#(important for simulatenous asclite scoring to proceed). +#(important for simultaneous asclite scoring to proceed). #There is actually only one such case for devset and automatic segmentetions join $tmpdir/utt2spk_stm $tmpdir/segments | \ awk '{ utt=$1; spk=$2; wav=$3; t_beg=$4; t_end=$5; @@ -122,7 +122,7 @@ for f in spk2utt utt2spk utt2spk_stm wav.scp text segments reco2file_and_channel done cp local/english.glm $dir/glm -#note, although utt2spk contains mappings to the whole meetings for simulatenous scoring +#note, although utt2spk contains mappings to the whole meetings for simultaneous scoring #we need to know which speakers overlap at meeting level, hence we generate an extra utt2spk_stm file local/convert2stm.pl $dir utt2spk_stm > $dir/stm diff --git a/egs/ami/asr1/local/ami_sdm_scoring_data_prep.sh b/egs/ami/asr1/local/ami_sdm_scoring_data_prep.sh index b0a656d1444..a2be3cd695a 100755 --- a/egs/ami/asr1/local/ami_sdm_scoring_data_prep.sh +++ b/egs/ami/asr1/local/ami_sdm_scoring_data_prep.sh @@ -106,7 +106,7 @@ awk '{print $1}' $tmpdir/segments | \ > $tmpdir/utt2spk_stm || exit 1; #check and correct the case when segment timings for given speaker overlap themself -#(important for simulatenous asclite scoring to proceed). +#(important for simultaneous asclite scoring to proceed). #There is actually only one such case for devset and automatic segmentetions join $tmpdir/utt2spk_stm $tmpdir/segments | \ awk '{ utt=$1; spk=$2; wav=$3; t_beg=$4; t_end=$5; diff --git a/egs/an4/asr1/cmd.sh b/egs/an4/asr1/cmd.sh index 4d70c9c7a79..7b70ef5e06e 100644 --- a/egs/an4/asr1/cmd.sh +++ b/egs/an4/asr1/cmd.sh @@ -22,7 +22,7 @@ # If jobs failed, your configuration might be wrong for your environment. # # -# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl: +# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl: # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html # =========================================================~ @@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then # The default setting is written in conf/slurm.conf. # You must change "-p cpu" and "-p gpu" for the "partion" for your environment. # To know the "partion" names, type "sinfo". 
- # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*" + # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*" # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}". export train_cmd="slurm.pl" diff --git a/egs/an4/tts1/cmd.sh b/egs/an4/tts1/cmd.sh index 4d70c9c7a79..7b70ef5e06e 100644 --- a/egs/an4/tts1/cmd.sh +++ b/egs/an4/tts1/cmd.sh @@ -22,7 +22,7 @@ # If jobs failed, your configuration might be wrong for your environment. # # -# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl: +# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl: # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html # =========================================================~ @@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then # The default setting is written in conf/slurm.conf. # You must change "-p cpu" and "-p gpu" for the "partion" for your environment. # To know the "partion" names, type "sinfo". - # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*" + # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*" # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}". export train_cmd="slurm.pl" diff --git a/egs/arctic/tts1/cmd.sh b/egs/arctic/tts1/cmd.sh index 4d70c9c7a79..7b70ef5e06e 100644 --- a/egs/arctic/tts1/cmd.sh +++ b/egs/arctic/tts1/cmd.sh @@ -22,7 +22,7 @@ # If jobs failed, your configuration might be wrong for your environment. # # -# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl: +# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl: # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html # =========================================================~ @@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then # The default setting is written in conf/slurm.conf. # You must change "-p cpu" and "-p gpu" for the "partion" for your environment. # To know the "partion" names, type "sinfo". - # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*" + # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*" # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}". export train_cmd="slurm.pl" diff --git a/egs/arctic/tts1/conf/train_pytorch_tacotron2.v3.finetune.yaml b/egs/arctic/tts1/conf/train_pytorch_tacotron2.v3.finetune.yaml index 70f84e663b3..ee46dcaf412 100644 --- a/egs/arctic/tts1/conf/train_pytorch_tacotron2.v3.finetune.yaml +++ b/egs/arctic/tts1/conf/train_pytorch_tacotron2.v3.finetune.yaml @@ -1,5 +1,5 @@ # This configuration uses reduction factor = 1 and location-sensitive attention. -# Furthermore, to accelerate the learning of diaogonal attention, we additionaly +# Furthermore, to accelerate the learning of diagonal attention, we additionally # use guided attention loss. This leads super fast and robust attention learning.
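(Editorial note: the guided attention loss named in this config header follows Tachibana et al. (2017). A sketch of the penalty, assuming N encoder positions, T decoder frames, attention weights A, and a sharpness hyperparameter g:)

```latex
% Attention mass far from the diagonal is penalized; minimizing the loss
% pulls A toward a monotonic, near-diagonal alignment.
W_{n,t} = 1 - \exp\!\left(-\frac{(n/N - t/T)^2}{2g^2}\right), \qquad
\mathcal{L}_{\mathrm{ga}} = \frac{1}{NT}\sum_{n=1}^{N}\sum_{t=1}^{T} A_{n,t}\, W_{n,t}
```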
# encoder related diff --git a/egs/arctic/tts1/local/data_download.sh b/egs/arctic/tts1/local/data_download.sh index e7fb368be7f..18da617f74e 100755 --- a/egs/arctic/tts1/local/data_download.sh +++ b/egs/arctic/tts1/local/data_download.sh @@ -1,4 +1,5 @@ -#!/usr/bin/env bash -e +#!/usr/bin/env bash +set -e # Copyright 2019 Nagoya University (Tomoki Hayashi) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) diff --git a/egs/arctic/tts1/local/data_prep.sh b/egs/arctic/tts1/local/data_prep.sh index d087c2f9f9d..3d23e19a720 100755 --- a/egs/arctic/tts1/local/data_prep.sh +++ b/egs/arctic/tts1/local/data_prep.sh @@ -1,4 +1,5 @@ -#!/usr/bin/env bash -e +#!/usr/bin/env bash +set -e # Copyright 2019 Nagoya University (Tomoki Hayashi) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) diff --git a/egs/arctic/tts1/local/pretrained_model_download.sh b/egs/arctic/tts1/local/pretrained_model_download.sh index 89698164812..cd01a43faec 100755 --- a/egs/arctic/tts1/local/pretrained_model_download.sh +++ b/egs/arctic/tts1/local/pretrained_model_download.sh @@ -1,4 +1,5 @@ -#!/usr/bin/env bash -e +#!/usr/bin/env bash +set -e # Copyright 2019 Nagoya University (Tomoki Hayashi) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) @@ -34,4 +35,4 @@ if [ ! -e ${dir}/.complete ]; then download_from_google_drive.sh ${share_url} ${dir} ".tar.gz" touch ${dir}/.complete fi -echo "Successfully finished donwload of pretrained model." +echo "Successfully finished download of pretrained model." diff --git a/egs/arctic/vc1/cmd.sh b/egs/arctic/vc1/cmd.sh index 4d70c9c7a79..7b70ef5e06e 100644 --- a/egs/arctic/vc1/cmd.sh +++ b/egs/arctic/vc1/cmd.sh @@ -22,7 +22,7 @@ # If jobs failed, your configuration might be wrong for your environment. # # -# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl: +# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl: # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html # =========================================================~ @@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then # The default setting is written in conf/slurm.conf. # You must change "-p cpu" and "-p gpu" for the "partion" for your environment. # To know the "partion" names, type "sinfo". - # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*" + # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*" # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}". 
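(Editorial note on the recurring shebang change in the local/*.sh hunks above: `#!/usr/bin/env bash -e` is unportable because on Linux the kernel passes everything after the interpreter path to `env` as a single argument, so `env` searches for a program literally named "bash -e". Moving the flag into the script body, as these hunks do, behaves identically everywhere:)

```bash
#!/usr/bin/env bash
set -e  # abort on the first failing command, as "bash -e" intended
```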
export train_cmd="slurm.pl" diff --git a/egs/arctic/vc1/conf/tuning/train_pytorch_transformer.tts_pt.v1.1.single.yaml b/egs/arctic/vc1/conf/tuning/train_pytorch_transformer.tts_pt.v1.1.single.yaml index d1d266a3329..ca1b7a12938 100644 --- a/egs/arctic/vc1/conf/tuning/train_pytorch_transformer.tts_pt.v1.1.single.yaml +++ b/egs/arctic/vc1/conf/tuning/train_pytorch_transformer.tts_pt.v1.1.single.yaml @@ -22,7 +22,7 @@ use-masking: True bce-pos-weight: 5.0 use-batch-norm: True use-scaled-pos-enc: True -encoder-normalize-before: False +encoder-normalize-before: True decoder-normalize-before: False encoder-concat-after: False decoder-concat-after: False diff --git a/egs/arctic/vc1/local/pretrained_model_download.sh b/egs/arctic/vc1/local/pretrained_model_download.sh index b15be3ba196..cdfb8a41d8c 100755 --- a/egs/arctic/vc1/local/pretrained_model_download.sh +++ b/egs/arctic/vc1/local/pretrained_model_download.sh @@ -1,4 +1,5 @@ -#!/usr/bin/env bash -e +#!/usr/bin/env bash +set -e # Copyright 2020 Nagoya University (Wen-Chin Huang) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) @@ -31,4 +32,4 @@ if [ ! -e ${dir}/.complete ]; then download_from_google_drive.sh ${share_url} ${dir} "tar.gz" touch ${dir}/.complete fi -echo "Successfully finished donwload of pretrained model." +echo "Successfully finished download of pretrained model." diff --git a/egs/aurora4/asr1/cmd.sh b/egs/aurora4/asr1/cmd.sh index 4d70c9c7a79..7b70ef5e06e 100644 --- a/egs/aurora4/asr1/cmd.sh +++ b/egs/aurora4/asr1/cmd.sh @@ -22,7 +22,7 @@ # If jobs failed, your configuration might be wrong for your environment. # # -# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl: +# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl: # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html # =========================================================~ @@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then # The default setting is written in conf/slurm.conf. # You must change "-p cpu" and "-p gpu" for the "partion" for your environment. # To know the "partion" names, type "sinfo". - # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*" + # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*" # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}". export train_cmd="slurm.pl" diff --git a/egs/babel/asr1/README.md b/egs/babel/asr1/README.md index e8cb946cb64..60d2d98cb5a 100644 --- a/egs/babel/asr1/README.md +++ b/egs/babel/asr1/README.md @@ -62,7 +62,7 @@ To run the experiment do `cd ../expname` -To specify the BABEL langauges in training refer to them by their language id. +To specify the BABEL languages in training refer to them by their language id. See conf/lang.conf for the exhaustive list of languages and corresponding language ids. diff --git a/egs/babel/asr1/cmd.sh b/egs/babel/asr1/cmd.sh index 4d70c9c7a79..7b70ef5e06e 100644 --- a/egs/babel/asr1/cmd.sh +++ b/egs/babel/asr1/cmd.sh @@ -22,7 +22,7 @@ # If jobs failed, your configuration might be wrong for your environment. # # -# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl: +# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl: # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html # =========================================================~ @@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then # The default setting is written in conf/slurm.conf. 
# You must change "-p cpu" and "-p gpu" for the "partion" for your environment. # To know the "partion" names, type "sinfo". - # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*" + # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*" # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}". export train_cmd="slurm.pl" diff --git a/egs/babel/asr1/conf/lang.conf b/egs/babel/asr1/conf/lang.conf index ae6fa9d4593..7b2960a7231 100644 --- a/egs/babel/asr1/conf/lang.conf +++ b/egs/babel/asr1/conf/lang.conf @@ -1,7 +1,7 @@ # A giant configurations file for all the BABEL languages # as well as some training configurations for training HMM-GMM systems # for obtaining phoneme level alignments if you really want to do that -# All paths starting with /export/* are set for the JHU/CLSP grid and shoudl +# All paths starting with /export/* are set for the JHU/CLSP grid and should # be changed appropriately for other users # Cantonese diff --git a/egs/babel/asr1/local/run_all.sh b/egs/babel/asr1/local/run_all.sh index d04d04c6f2f..3359508b675 100755 --- a/egs/babel/asr1/local/run_all.sh +++ b/egs/babel/asr1/local/run_all.sh @@ -8,7 +8,7 @@ for x in 101-cantonese 102-assamese 103-bengali 104-pashto 105-turkish 106-tagal ./setup_experiment.sh asr1_${lang} pushd ../asr1_${lang} ./run.sh --langs $langid --recog $langid --ngpu 1 & - sleep 20m # to avoid too many disk access happend at the same time + sleep 20m # to avoid too many disk accesses happening at the same time popd done diff --git a/egs/blizzard17/tts1/cmd.sh b/egs/blizzard17/tts1/cmd.sh index 4d70c9c7a79..7b70ef5e06e 100644 --- a/egs/blizzard17/tts1/cmd.sh +++ b/egs/blizzard17/tts1/cmd.sh @@ -22,7 +22,7 @@ # If jobs failed, your configuration might be wrong for your environment. # # -# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl: +# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl: # "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html # =========================================================~ @@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then # The default setting is written in conf/slurm.conf. # You must change "-p cpu" and "-p gpu" for the "partion" for your environment. # To know the "partion" names, type "sinfo". - # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*" + # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*" # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}". export train_cmd="slurm.pl" diff --git a/egs/blizzard17/tts1/conf/tuning/train_pytorch_tacotron2.tuning.lab3-rev3.yaml b/egs/blizzard17/tts1/conf/tuning/train_pytorch_tacotron2.tuning.lab3-rev3.yaml index 504d62846b3..f7c02fc2405 100755 --- a/egs/blizzard17/tts1/conf/tuning/train_pytorch_tacotron2.tuning.lab3-rev3.yaml +++ b/egs/blizzard17/tts1/conf/tuning/train_pytorch_tacotron2.tuning.lab3-rev3.yaml @@ -1,5 +1,5 @@ # To make the attention wight diagonal in decoding, we use forward attention. -# Futhermore, we use reduction-fucter :3 to generate clear speech. +# Furthermore, we use reduction factor 3 to generate clear speech. # encoder related embed-dim: 512 diff --git a/egs/chime4/asr1/cmd.sh b/egs/chime4/asr1/cmd.sh index 4d70c9c7a79..7b70ef5e06e 100644 --- a/egs/chime4/asr1/cmd.sh +++ b/egs/chime4/asr1/cmd.sh @@ -22,7 +22,7 @@ # If jobs failed, your configuration might be wrong for your environment.
 #
 #
-# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl:
+# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
 #   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
 # =========================================================~

@@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then
     # The default setting is written in conf/slurm.conf.
     # You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
     # To know the "partion" names, type "sinfo".
-    # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*"
+    # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
     # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
     export train_cmd="slurm.pl"
diff --git a/egs/chime4/asr1/run.sh b/egs/chime4/asr1/run.sh
index 38196c45088..85c4e4a64eb 100755
--- a/egs/chime4/asr1/run.sh
+++ b/egs/chime4/asr1/run.sh
@@ -72,7 +72,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         ${chime4_data}/data/audio/16kHz/isolated_2ch_track enhan/beamformit_2mics
     local/run_beamform_6ch_track.sh --cmd "${train_cmd}" --nj 20 \
         ${chime4_data}/data/audio/16kHz/isolated_6ch_track enhan/beamformit_5mics
-    echo "prepartion for chime4 data"
+    echo "preparation for chime4 data"
     local/real_noisy_chime4_data_prep.sh ${chime4_data}
     local/simu_noisy_chime4_data_prep.sh ${chime4_data}
     echo "test data for 1ch track"
diff --git a/egs/chime4/asr1_multich/cmd.sh b/egs/chime4/asr1_multich/cmd.sh
index 4d70c9c7a79..7b70ef5e06e 100644
--- a/egs/chime4/asr1_multich/cmd.sh
+++ b/egs/chime4/asr1_multich/cmd.sh
@@ -22,7 +22,7 @@
 #   If jobs failed, your configuration might be wrong for your environment.
 #
 #
-# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl:
+# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
 #   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
 # =========================================================~

@@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then
     # The default setting is written in conf/slurm.conf.
     # You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
     # To know the "partion" names, type "sinfo".
-    # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*"
+    # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
     # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
     export train_cmd="slurm.pl"
diff --git a/egs/chime4/asr1_multich/run.sh b/egs/chime4/asr1_multich/run.sh
index 28651c03b59..a0f395e6193 100755
--- a/egs/chime4/asr1_multich/run.sh
+++ b/egs/chime4/asr1_multich/run.sh
@@ -61,7 +61,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     wsj0_data=${chime4_data}/data/WSJ0
     local/clean_wsj0_data_prep.sh ${wsj0_data}
     local/clean_chime4_format_data.sh
-    echo "prepartion for chime4 data"
+    echo "preparation for chime4 data"
     local/real_noisy_chime4_data_prep.sh ${chime4_data}
     local/simu_noisy_chime4_data_prep.sh ${chime4_data}
     local/bth_chime4_data_prep.sh ${chime4_data}
diff --git a/egs/chime5/asr1/cmd.sh b/egs/chime5/asr1/cmd.sh
index 4d70c9c7a79..7b70ef5e06e 100644
--- a/egs/chime5/asr1/cmd.sh
+++ b/egs/chime5/asr1/cmd.sh
@@ -22,7 +22,7 @@
 #   If jobs failed, your configuration might be wrong for your environment.
 #
 #
-# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl:
+# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
 #   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
 # =========================================================~

@@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then
     # The default setting is written in conf/slurm.conf.
     # You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
     # To know the "partion" names, type "sinfo".
-    # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*"
+    # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
     # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
     export train_cmd="slurm.pl"
diff --git a/egs/chime6/asr1/cmd.sh b/egs/chime6/asr1/cmd.sh
index 4d70c9c7a79..7b70ef5e06e 100644
--- a/egs/chime6/asr1/cmd.sh
+++ b/egs/chime6/asr1/cmd.sh
@@ -22,7 +22,7 @@
 #   If jobs failed, your configuration might be wrong for your environment.
 #
 #
-# The official documentaion for run.pl, queue.pl, slurm.pl, and ssh.pl:
+# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
 #   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
 # =========================================================~

@@ -56,7 +56,7 @@ elif [ "${cmd_backend}" = slurm ]; then
     # The default setting is written in conf/slurm.conf.
     # You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
     # To know the "partion" names, type "sinfo".
-    # You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*"
+    # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
     # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
     export train_cmd="slurm.pl"
diff --git a/egs/chime6/asr1/local/install_pb_chime5.sh b/egs/chime6/asr1/local/install_pb_chime5.sh
index 430edb6810d..3a3805daff2 100755
--- a/egs/chime6/asr1/local/install_pb_chime5.sh
+++ b/egs/chime6/asr1/local/install_pb_chime5.sh
@@ -8,7 +8,7 @@ cd pb_chime5
 git submodule init
 git submodule update

-# sudo apt install libopenmpi-dev -- if you have problem with mpi4py instalation
+# sudo apt install libopenmpi-dev -- if you have problems with mpi4py installation
 python -m pip install cython
 python -m pip install pymongo
diff --git a/egs/cmu_indic/tts1/cmd.sh b/egs/cmu_indic/tts1/cmd.sh
new file mode 100644
index 00000000000..7b70ef5e06e
--- /dev/null
+++ b/egs/cmu_indic/tts1/cmd.sh
@@ -0,0 +1,89 @@
+# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
+# e.g.
+#     run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+#
+# Options:
+#     --time
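
All of the cmd.sh files touched above document the same <cmd>.pl interface, so a minimal usage sketch may help readers unfamiliar with it. This is illustrative only, not part of the patch; the exp/demo log paths and echoed strings are placeholders invented for the example:

    # Local backend: run a 10-way array job. JOB is substituted into both the
    # log name and the command, so task 3 writes "running task 3" to
    # exp/demo/echo.3.log.
    run.pl JOB=1:10 exp/demo/echo.JOB.log echo "running task JOB"

    # The same call shape works unchanged once cmd.sh has selected a cluster
    # backend; under slurm.pl, "--gpu 1" is interpreted as "--gres gpu:1",
    # as the comments in the cmd.sh hunks above note.
    ${train_cmd} --gpu 1 exp/demo/train.log echo "placeholder training command"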